From 2986dc21201fe1a687badd62d2be667d6b335ffe Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sun, 3 Nov 2024 10:48:55 +0500 Subject: [PATCH 001/151] implement config and model building blocks --- .../depth_pro/configuration_depth_pro.py | 167 ++ .../models/depth_pro/modeling_depth_pro.py | 1404 +++++++++++++++++ 2 files changed, 1571 insertions(+) create mode 100644 src/transformers/models/depth_pro/configuration_depth_pro.py create mode 100644 src/transformers/models/depth_pro/modeling_depth_pro.py diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py new file mode 100644 index 000000000000..ad0f1016f7a1 --- /dev/null +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -0,0 +1,167 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""DepthPro model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from transformers.configuration_utils import PretrainedConfig +from transformers.onnx import OnnxConfig +from transformers.utils import logging +from transformers.utils.backbone_utils import get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + + +class DepthProConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DepthProModel`]. It is used to instantiate a + DepthPro model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the DepthPro + [apple/DepthPro](https://huggingface.co/apple/DepthPro) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of the hidden size of the MLPs relative to the `hidden_size`. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 384):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        layerscale_value (`float`, *optional*, defaults to 1.0):
+            Initial value to use for layer scale.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Stochastic depth rate per sample (when applied in the main path of residual layers).
+        use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
+            Whether to use the SwiGLU feedforward neural network.
+        out_features (`List[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`List[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        apply_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether to apply layer normalization to the feature maps in case the model is used as backbone.
+        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+            Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
+            case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
+            seq_len, hidden_size)`.
+        decoder_hidden_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the decoder (feature fusion) layers.
+        patch_encoder_hook_ids (`List[int]`, *optional*, defaults to `[5, 11]`):
+            Indices of the patch-encoder layers whose hidden states are used as intermediate features.
+        patch_encoder_feature_dims (`List[int]`, *optional*, defaults to `[256, 512, 1024, 1024]`):
+            Channel dimensions of the upsampled encoder features that are passed to the decoder.
+        use_batch_norm_in_decoder (`bool`, *optional*, defaults to `False`):
+            Whether to use batch normalization in the residual layers of the decoder.
+        use_fov (`bool`, *optional*, defaults to `False`):
+            Whether to add the field-of-view (FOV) estimation head.
+ + Example: + + ```python + >>> from transformers import DepthProConfig, DepthProModel + + >>> # Initializing a DepthPro apple/DepthPro style configuration + >>> configuration = DepthProConfig() + + >>> # Initializing a model (with random weights) from the apple/DepthPro style configuration + >>> model = DepthProModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "depth_pro" + + def __init__( + self, + hidden_size=1024, # changed + decoder_hidden_size=256, + num_hidden_layers=24, # changed + num_attention_heads=16, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=384, + patch_size=16, # changed + num_channels=3, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + use_swiglu_ffn=False, + out_features=None, + out_indices=None, + apply_layernorm=True, + reshape_hidden_states=True, + patch_encoder_hook_ids = [5, 11], + # patch_encoder_hook_ids = [5, 11, 17, 23], + patch_encoder_feature_dims = [256, 512, 1024, 1024], + use_batch_norm_in_decoder=False, + use_fov=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.decoder_hidden_size = decoder_hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.layerscale_value = layerscale_value + self.drop_path_rate = drop_path_rate + self.use_swiglu_ffn = use_swiglu_ffn + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.apply_layernorm = apply_layernorm + self.reshape_hidden_states = reshape_hidden_states + self.patch_encoder_hook_ids = patch_encoder_hook_ids + self.patch_encoder_feature_dims = patch_encoder_feature_dims + self.use_batch_norm_in_decoder = use_batch_norm_in_decoder + self.use_fov = use_fov diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py new file mode 100644 index 000000000000..f73b74060f57 --- /dev/null +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -0,0 +1,1404 @@ +# coding=utf-8 +# Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch DepthPro model.""" + +from icecream import ic + +import collections.abc +import math +from typing import Dict, List, Optional, Set, Tuple, Union + +import torch +from torch import nn +from dataclasses import dataclass + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, +) +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from .configuration_depth_pro import DepthProConfig + + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings with Dinov2->DepthProViT +class DepthProViTPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +# Copied from transformers.models.dinov2.modeling_dinov2.DepthProViTEmbeddings +# with DepthProViT->DepthProViT and antialias=True in interpolation +class DepthProViTEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. + """ + + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.patch_embeddings = DepthProViTPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing and interpolation at torch.float32 precision. 
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + target_dtype = patch_pos_embed.dtype + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.to(torch.float32), + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + antialias=True, # except for this, the class is same as transformers.models.dinov2.modeling_dinov2.DepthProPatchEmbeddings + ).to(dtype=target_dtype) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embeddings.projection.weight.dtype + embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + embeddings = self.dropout(embeddings) + + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthProViT +class DepthProViTSelfAttention(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SelfAttention with Dinov2->DepthProViT +class DepthProViTSdpaSelfAttention(DepthProViTSelfAttention): + def __init__(self, config: DepthProConfig) -> None: + super().__init__(config) + self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "DepthProModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions + ) + + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + head_mask, + self.attention_probs_dropout_prob if self.training else 0.0, + is_causal=False, + scale=None, + ) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, None + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->DepthProViT +class DepthProViTSelfOutput(nn.Module): + """ + The residual connection is defined in DepthProViTLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->DepthProViT +class DepthProViTAttention(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.attention = DepthProViTSelfAttention(config) + self.output = DepthProViTSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->DepthProViT +class DepthProViTSdpaAttention(DepthProViTAttention): + def __init__(self, config: DepthProConfig) -> None: + super().__init__(config) + self.attention = DepthProViTSdpaSelfAttention(config) + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaAttention with 
Dinov2->DepthProViT +class DepthProViTLayerScale(nn.Module): + def __init__(self, config) -> None: + super().__init__() + self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size)) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + return hidden_state * self.lambda1 + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath +class DepthProViTDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2MLP with Dinov2->DepthProViT +class DepthProViTMLP(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SwiGLUFFN with Dinov2->DepthProViT +class DepthProViTSwiGLUFFN(nn.Module): + def __init__(self, config) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + + self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True) + self.weights_out = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.weights_in(hidden_state) + x1, x2 = hidden_state.chunk(2, dim=-1) + hidden = nn.functional.silu(x1) * x2 + return self.weights_out(hidden) + + +DEPTHPROVIT_ATTENTION_CLASSES = 
{ + "eager": DepthProViTAttention, + "sdpa": DepthProViTSdpaAttention, +} + + +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2->DepthProViT +class DepthProViTLayer(nn.Module): + """This corresponds to the Block class in the original implementation.""" + + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + + self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = DEPTHPROVIT_ATTENTION_CLASSES[config._attn_implementation](config) + self.layer_scale1 = DepthProViTLayerScale(config) + self.drop_path = DepthProViTDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + + self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.use_swiglu_ffn: + self.mlp = DepthProViTSwiGLUFFN(config) + else: + self.mlp = DepthProViTMLP(config) + self.layer_scale2 = DepthProViTLayerScale(config) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.norm1(hidden_states), # in DepthProViT, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + attention_output = self.layer_scale1(attention_output) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = self.drop_path(attention_output) + hidden_states + + # in DepthProViT, layernorm is also applied after self-attention + layer_output = self.norm2(hidden_states) + layer_output = self.mlp(layer_output) + layer_output = self.layer_scale2(layer_output) + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->DepthProViT +class DepthProViTEncoder(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([DepthProViTLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + 
last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class DepthProViT(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + self.embeddings = DepthProViTEmbeddings(config) + self.encoder = DepthProViTEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + if not return_dict: + head_outputs = (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class DepthProEncoder(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + + self.out_size = 24 # TODO: image_size // patch_size + + # patch encoder + self.patch_encoder = DepthProViT(config) + self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[0]].register_forward_hook( + self._intermediate0_hook + ) + self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[1]].register_forward_hook( + self._intermediate1_hook + ) + + # image encoder + self.image_encoder = DepthProViT(config) + + # upsampling features (1-2) + self.upsample_intermediate0 = self._create_project_upsample_block( + input_dims=config.hidden_size, + intermediate_dims=config.patch_encoder_feature_dims[0], + output_dims=config.decoder_hidden_size, + n_upsample_layers=3, + ) + self.upsample_intermediate1 = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[0], + n_upsample_layers=2, + ) + + # upsampling features (3-5) + self.upsample_high_res = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[1], + n_upsample_layers=1, + ) + self.upsample_med_res = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[2], + n_upsample_layers=1, + ) + self.upsample_low_res = self._create_project_upsample_block( + input_dims=config.hidden_size, + output_dims=config.patch_encoder_feature_dims[3], + n_upsample_layers=1, + ) + + # upsampling features (6) + self.upsample_image = nn.ConvTranspose2d( + in_channels=config.hidden_size, + out_channels=config.patch_encoder_feature_dims[3], + kernel_size=2, + stride=2, + padding=0, + bias=True, + ) + self.fuse_image_with_low_res = nn.Conv2d( + 
in_channels=(config.patch_encoder_feature_dims[3] + config.patch_encoder_feature_dims[3]), + out_channels=config.patch_encoder_feature_dims[3], + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + + def _intermediate0_hook(self, model, input, output): + self.intermediate0_hidden_states = output[0] + + def _intermediate1_hook(self, model, input, output): + self.intermediate1_hidden_states = output[0] + + def _create_project_upsample_block( + self, + input_dims: int, + output_dims: int, + n_upsample_layers: int, + intermediate_dims: Optional[int] = None, + ) -> nn.Module: + + intermediate_dims = intermediate_dims or output_dims + + # Projection block followed by upsampling blocks. + blocks = [ + nn.Conv2d(input_dims, intermediate_dims, kernel_size=1, stride=1, padding=0, bias=False) + ] + [ + nn.ConvTranspose2d( + in_channels=(intermediate_dims if i == 0 else output_dims), + out_channels=output_dims, + kernel_size=2, + stride=2, + padding=0, + bias=False + ) for i in range(n_upsample_layers) + ] + + return nn.Sequential(*blocks) + + def _interpolate(self, pixel_values, scale_factor): + return nn.functional.interpolate( + pixel_values, + size=None, + scale_factor=scale_factor, + mode="bilinear", + align_corners=False, + ) + + def _patch(self, pixel_values, overlap_ratio): + patch_size = 384 # TODO: this should be infered + patch_stride = int(patch_size * (1 - overlap_ratio)) + + image_size = pixel_values.shape[-1] + steps = int(math.ceil((image_size - patch_size) / patch_stride)) + 1 + + x_patch_list = [] + for j in range(steps): + j0 = j * patch_stride + j1 = j0 + patch_size + + for i in range(steps): + i0 = i * patch_stride + i1 = i0 + patch_size + x_patch_list.append(pixel_values[..., j0:j1, i0:i1]) + + return torch.cat(x_patch_list, dim=0) + + def _reshape_feature( + self, hidden_states: torch.Tensor, width, height, cls_token_offset=1 + ): + """Discard class token and reshape 1D feature map to a 2D grid.""" + b, hw, c = hidden_states.shape + + # Remove class token. 
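+        # (the patch and image encoders prepend a single [CLS] token, hence the default `cls_token_offset` of 1)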
+ if cls_token_offset > 0: + hidden_states = hidden_states[:, cls_token_offset:, :] + + # Shape: (batch, height, width, dim) -> (batch, dim, height, width) + hidden_states = hidden_states.reshape(b, height, width, c).permute(0, 3, 1, 2) + return hidden_states + + def _merge(self, x: torch.Tensor, batch_size: int, padding: int = 3) -> torch.Tensor: + """Merge the patched input into a image with sliding window.""" + steps = int(math.sqrt(x.shape[0] // batch_size)) + + idx = 0 + + output_list = [] + for j in range(steps): + output_row_list = [] + for i in range(steps): + output = x[batch_size * idx : batch_size * (idx + 1)] + + if j != 0: + output = output[..., padding:, :] + if i != 0: + output = output[..., :, padding:] + if j != steps - 1: + output = output[..., :-padding, :] + if i != steps - 1: + output = output[..., :, :-padding] + + output_row_list.append(output) + idx += 1 + + output_row = torch.cat(output_row_list, dim=-1) + output_list.append(output_row) + output = torch.cat(output_list, dim=-2) + return output + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size = pixel_values.shape[0] + + # STEP 1: create 3-level image + + high_res = pixel_values + med_res = self._interpolate(pixel_values, 0.5) + low_res = self._interpolate(pixel_values, 0.25) + + # STEP 2: create patches + + high_res_patches = self._patch(high_res, 0.25) + med_res_patches = self._patch(med_res, 0.5) + low_res_patches = low_res + + patches = torch.cat( + (high_res_patches, med_res_patches, low_res_patches), + dim=0, + ) + + # STEP 3: apply patch encoder + + patch_encodings = self.patch_encoder( + patches, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + patch_features = patch_encodings[0] + patch_features = self._reshape_feature( + patch_features, self.out_size, self.out_size + ) + + # STEP 4: Get Intermediate Features (features 1 and 2) + + intermediate0_features = self._reshape_feature( + self.intermediate0_hidden_states, + self.out_size, + self.out_size, + ) + intermediate1_features = self._reshape_feature( + self.intermediate1_hidden_states, + self.out_size, + self.out_size, + ) + intermediate0_features = self._merge( + intermediate0_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + intermediate1_features = self._merge( + intermediate1_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + + # STEP 5: Get Patch Encoder Features (features 3-5) + + high_res_features, med_res_features, low_res_features = torch.split( + patch_features, + [len(high_res_patches), len(med_res_patches), len(low_res_patches)], + dim=0, + ) + + high_res_features = self._merge(high_res_features, batch_size=batch_size, padding=3) + med_res_features = self._merge(med_res_features, batch_size=batch_size, padding=6) + low_res_features = low_res_features + + # STEP 6: Get Image Encoder Features (features 6) + + image_encodings = self.image_encoder( + pixel_values=low_res_patches, + head_mask=head_mask, + 
output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+        image_features = image_encodings[0]
+        image_features = self._reshape_feature(
+            image_features, self.out_size, self.out_size
+        )
+
+        # STEP 7: Upsample All Features (features 1-6)
+
+        # features (1-2)
+        intermediate0_features = self.upsample_intermediate0(
+            intermediate0_features
+        )
+        intermediate1_features = self.upsample_intermediate1(
+            intermediate1_features
+        )
+
+        # features (3-5)
+        high_res_features = self.upsample_high_res(high_res_features)
+        med_res_features = self.upsample_med_res(med_res_features)
+        low_res_features = self.upsample_low_res(low_res_features)
+
+        # feature (6)
+        image_features = self.upsample_image(image_features)
+        image_features = self.fuse_image_with_low_res(
+            torch.cat((low_res_features, image_features), dim=1)
+        )
+
+        last_hidden_state = [
+            intermediate0_features,
+            intermediate1_features,
+            high_res_features,
+            med_res_features,
+            # low_res_features,
+            image_features,  # fused with low_res_features
+        ]
+
+        hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None
+        attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None
+
+        if not return_dict:
+            return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None)
+
+        return BaseModelOutput(
+            last_hidden_state=last_hidden_state,
+            hidden_states=hidden_states,
+            attentions=attentions,
+        )
+
+
+class DepthProFOVModel(nn.Module):
+    def __init__(self, config: DepthProConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.decoder_hidden_size = config.decoder_hidden_size
+
+        self.encoder = DepthProViT(config)
+        self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2)
+        self.low_res_neck = nn.Sequential(
+            nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1),
+            nn.ReLU(True)
+        )
+        self.head = nn.Sequential(
+            nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1),
+            nn.ReLU(True),
+            nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1),
+            nn.ReLU(True),
+            nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0),
+        )
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        low_res_features: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        pixel_values = nn.functional.interpolate(
+            pixel_values,
+            size=None,
+            scale_factor=0.25,
+            mode="bilinear",
+            align_corners=False,
+        )
+        encoder_outputs = self.encoder(
+            pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        image_features = encoder_outputs[0]
+
+        image_features = self.encoder_neck(image_features)
+
+        # drop the [CLS] token and move channels to the second dimension:
+        # (batch_size, seq_len, decoder_hidden_size // 2) -> (batch_size, decoder_hidden_size // 2, seq_len)
+        image_features = image_features[:, 1:]
+        image_features = image_features.permute(0, 2, 1)
+
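+        # project the decoder's low-resolution features (channels halved, spatial size halved by the stride-2
+        # convolution) so they can be reshaped against and added to the flattened ViT features below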
low_res_features = self.low_res_neck(low_res_features) + + image_features = image_features.reshape_as(low_res_features) + image_features = image_features + low_res_features + fov_output = self.head(image_features) + fov_output = fov_output.reshape(1) + + if not return_dict: + head_outputs = (fov_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=fov_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPTPreAct->DepthPro +class DepthProResidualLayer(nn.Module): + def __init__(self, config): + super().__init__() + + self.use_batch_norm = config.use_batch_norm_in_decoder + self.hidden_size = config.decoder_hidden_size + + self.activation1 = nn.ReLU() + self.convolution1 = nn.Conv2d( + self.hidden_size, + self.hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=(not self.use_batch_norm), + ) + + self.activation2 = nn.ReLU() + self.convolution2 = nn.Conv2d( + self.hidden_size, + self.hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=(not self.use_batch_norm), + ) + + if self.use_batch_norm: + self.batch_norm1 = nn.BatchNorm2d(self.hidden_size) + self.batch_norm2 = nn.BatchNorm2d(self.hidden_size) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + + hidden_state = self.convolution1(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm1(hidden_state) + + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm2(hidden_state) + + return hidden_state + residual + + +# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +class DepthProFeatureFusionLayer(nn.Module): + def __init__(self, config: DepthProConfig, use_deconv:bool=True) -> None: + super().__init__() + self.config = config + self.use_deconv = use_deconv + + self.residual_layer1 = DepthProResidualLayer(config) + self.residual_layer2 = DepthProResidualLayer(config) + + if self.use_deconv: + self.deconv = nn.ConvTranspose2d( + in_channels=config.decoder_hidden_size, + out_channels=config.decoder_hidden_size, + kernel_size=2, + stride=2, + padding=0, + bias=False, + ) + + self.projection = nn.Conv2d(config.decoder_hidden_size, config.decoder_hidden_size, kernel_size=1, bias=True) + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, hidden_state, residual=None): + if residual is not None: + hidden_state = self.skip_add.add(hidden_state, self.residual_layer1(residual)) + + hidden_state = self.residual_layer2(hidden_state) + if self.use_deconv: + hidden_state = self.deconv(hidden_state) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage +class DepthProDecoder(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() + self.config = config + + self.hidden_size = config.decoder_hidden_size + self.decoder_feature_dims = [config.decoder_hidden_size] + config.patch_encoder_feature_dims + + self.projections = nn.ModuleList() + self.fusions = nn.ModuleList() + for i, dim in enumerate(self.decoder_feature_dims): + + # Projection + if i != 0: + # conv for hidden_states[1:] + projection = nn.Conv2d( + in_channels=dim, + out_channels=self.hidden_size, + kernel_size=3, 
+ stride=1, + padding=1, + bias=False, + ) + elif self.hidden_size != dim: + # first hidden_state with dim differnet from hidden_size + projection = nn.Conv2d( + in_channels=dim, + out_channels=self.hidden_size, + kernel_size=1, + bias=False, + ) + else: + # first hidden_state with dim same as hidden_size + projection = nn.Identity() + self.projections.append(projection) + + # Fusion + fusion = DepthProFeatureFusionLayer(config, use_deconv=(i!=0)) + self.fusions.append(fusion) + + def forward(self, hidden_states): + + if len(hidden_states) != len(self.decoder_feature_dims): + raise ValueError( + f"Got number of hidden_states = {len(hidden_states)}," + f"expected number of hidden_states = {len(self.decoder_feature_dims)}." + ) + + # first extract the low_res_features + last_features = hidden_states[-1] + last_features = self.projections[-1](last_features) + low_res_features = last_features # required later for fov_encoder + last_features = self.fusions[-1](last_features) + + # now get features through each layer + for i in range(len(hidden_states) - 2, -1, -1): + hidden_state = hidden_states[i] + projection = self.projections[i] + fusion = self.fusions[i] + + projected = projection(hidden_state) + last_features = fusion(last_features, projected) + + return last_features, low_res_features + + +class DepthProPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DepthProConfig + base_model_prefix = "depth_pro" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["DepthProViTSwiGLUFFN"] + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +DEPTH_PRO_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEPTH_PRO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.",
+    DEPTH_PRO_START_DOCSTRING,
+)
+class DepthProModel(DepthProPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.use_fov = config.use_fov
+
+        # dinov2 (vit) like encoder
+        self.encoder = DepthProEncoder(config)
+        # dpt (vit) like decoder
+        self.decoder = DepthProDecoder(config)
+        # dinov2 (vit) like encoder
+        self.fov_model = DepthProFOVModel(config) if self.use_fov else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        embeddings = {
+            "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings,
+            "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings,
+        }
+        if self.use_fov:
+            embeddings["fov_embeddings"] = self.fov_model.encoder.embeddings.patch_embeddings
+        return embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads)
+            self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads)
+            if self.use_fov:
+                self.fov_model.encoder.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING)
+    # TODO
+    # @add_code_sample_docstrings(
+    #     checkpoint=_CHECKPOINT_FOR_DOC,
+    #     output_type=BaseModelOutputWithPoolingAndIntermediateActivations,
+    #     config_class=_CONFIG_FOR_DOC,
+    #     modality="vision",
+    #     expected_output=_EXPECTED_OUTPUT_SHAPE,
+    # )
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        encodings = self.encoder(
+            pixel_values,
+            head_mask,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+        )
+
+        encodings_last_hidden_state = encodings.last_hidden_state if return_dict else encodings[0]
+
+        features, low_res_features = self.decoder(encodings_last_hidden_state)
+
+        if self.use_fov:
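+            # the FOV head re-encodes a downsampled copy of the image and fuses it with the decoder's
+            # low-resolution features; detaching them keeps the FOV branch from backpropagating into the decoder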
+ fov_out = self.fov_model( + pixel_values=pixel_values, + low_res_features=low_res_features.detach(), + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + else: + fov_out = None + + return features, fov_out + + +class DepthProDepthEstimationHead(nn.Module): + """ + # TODO + Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples + the predictions to the input resolution after the first convolutional layer (details can be found in the paper's + supplementary material). + """ + + def __init__(self, config): + super().__init__() + self.config = config + + features = config.decoder_hidden_size + self.head = nn.Sequential( + nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1), + nn.ConvTranspose2d( + in_channels=features//2, out_channels=features//2, + kernel_size=2, stride=2, padding=0, bias=True + ), + nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + ) + + + def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: + predicted_depth = self.head(hidden_states) + predicted_depth = predicted_depth.squeeze(dim=1) + return predicted_depth + + +@add_start_docstrings( + """ + DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). + """, + DEPTH_PRO_START_DOCSTRING, +) +class DepthProForDepthEstimation(DepthProPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.depth_pro = DepthProModel(config) + self.head = DepthProDepthEstimationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) + # @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + + Returns: + + Examples: + ```python + >>> from transformers import AutoImageProcessor, DPTForDepthEstimation + >>> import torch + >>> import numpy as np + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large") + >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + ... predicted_depth = outputs.predicted_depth + + >>> # interpolate to original size + >>> prediction = torch.nn.functional.interpolate( + ... predicted_depth.unsqueeze(1), + ... size=image.size[::-1], + ... mode="bicubic", + ... align_corners=False, + ... 
) + + >>> # visualize the prediction + >>> output = prediction.squeeze().cpu().numpy() + >>> formatted = (output * 255 / np.max(output)).astype("uint8") + >>> depth = Image.fromarray(formatted) + ```""" + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + outputs = [None] * 4 + + hidden_states, fov_out = self.depth_pro( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + predicted_depth = self.head(hidden_states) + ic(predicted_depth.shape) + ic(fov_out.shape) + + # ic(predicted_depth); exit() + ic(fov_out); exit() + + if not return_dict: + if output_hidden_states: + output = (predicted_depth,) + outputs[1:] + else: + output = (predicted_depth,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return DepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + # hidden_states=outputs.hidden_states, + # attentions=outputs.attentions, + ) From 1728a2ff687435bc615a8c67d9a4f55baa6ff8d4 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 9 Nov 2024 16:23:06 +0500 Subject: [PATCH 002/151] refactor model architechture --- .../depth_pro/configuration_depth_pro.py | 19 +- .../models/depth_pro/modeling_depth_pro.py | 478 ++++++++++-------- 2 files changed, 288 insertions(+), 209 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index ad0f1016f7a1..7e66e679c67f 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -129,9 +129,18 @@ def __init__( out_indices=None, apply_layernorm=True, reshape_hidden_states=True, + patch_encoder_feature_dims = [256, 512, 1024, 1024], + patch_encoder_hook_ids = [5, 11], # patch_encoder_hook_ids = [5, 11, 17, 23], - patch_encoder_feature_dims = [256, 512, 1024, 1024], + intermediate_feature_dims = [256, 256], + intermediate_upsample_layers = [3, 2], + high_res_feature_dims = 512, + med_res_feature_dims = 1024, + low_res_feature_dims = 1024, + image_feature_dims = 1024, + global_feature_dims = 1024, + use_batch_norm_in_decoder=False, use_fov=False, **kwargs, @@ -165,3 +174,11 @@ def __init__( self.patch_encoder_feature_dims = patch_encoder_feature_dims self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov = use_fov + + self.intermediate_feature_dims = intermediate_feature_dims + self.intermediate_upsample_layers = intermediate_upsample_layers + self.high_res_feature_dims = high_res_feature_dims + self.med_res_feature_dims = med_res_feature_dims + self.low_res_feature_dims = low_res_feature_dims + self.image_feature_dims = image_feature_dims + self.global_feature_dims = global_feature_dims diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f73b74060f57..74669bc4e557 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -568,105 +568,112 @@ def forward( ) +class 
DepthProUpsampleBlock(nn.Module): + def __init__( + self, + input_dims, + intermediate_dims, + output_dims, + n_upsample_layers, + use_proj=True, + bias=False, + ) -> None: + super().__init__() + + # create first projection block + if use_proj: + self.proj = nn.Conv2d( + in_channels=input_dims, + out_channels=intermediate_dims, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + else: + self.proj = nn.Identity() + + # create following upsample blocks + self.upsample_blocks = nn.Sequential() + for i in range(n_upsample_layers): + in_channels = intermediate_dims if i == 0 else output_dims + layer = nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=output_dims, + kernel_size=2, + stride=2, + padding=0, + bias=bias, + ) + self.upsample_blocks.append(layer) + + def forward(self, features): + projected = self.proj(features) + return self.upsample_blocks(projected) + class DepthProEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config - + self.hidden_size = config.hidden_size + self.decoder_hidden_size = config.decoder_hidden_size + self.patch_encoder_hook_ids = config.patch_encoder_hook_ids + self.intermediate_feature_dims = config.intermediate_feature_dims + self.intermediate_upsample_layers = config.intermediate_upsample_layers + self.out_size = 24 # TODO: image_size // patch_size # patch encoder self.patch_encoder = DepthProViT(config) - self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[0]].register_forward_hook( - self._intermediate0_hook - ) - self.patch_encoder.encoder.layer[config.patch_encoder_hook_ids[1]].register_forward_hook( - self._intermediate1_hook - ) # image encoder self.image_encoder = DepthProViT(config) - # upsampling features (1-2) - self.upsample_intermediate0 = self._create_project_upsample_block( - input_dims=config.hidden_size, - intermediate_dims=config.patch_encoder_feature_dims[0], - output_dims=config.decoder_hidden_size, - n_upsample_layers=3, - ) - self.upsample_intermediate1 = self._create_project_upsample_block( - input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[0], - n_upsample_layers=2, - ) + # upsampling intermediate features - (1-2) in diagram + self.upsample_intermediate = nn.ModuleList() + for i, (feature_dims, upsample_layers) in enumerate(zip( + self.intermediate_feature_dims, + self.intermediate_upsample_layers, + )): + intermediate_dims = self.decoder_hidden_size if i == 0 else feature_dims + upsample_block = DepthProUpsampleBlock( + input_dims=config.hidden_size, + intermediate_dims=intermediate_dims, + output_dims=feature_dims, + n_upsample_layers=upsample_layers, + ) + self.upsample_intermediate.append(upsample_block) - # upsampling features (3-5) - self.upsample_high_res = self._create_project_upsample_block( + # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram + self.upsample_high_res = DepthProUpsampleBlock( input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[1], + intermediate_dims=config.high_res_feature_dims, + output_dims=config.high_res_feature_dims, n_upsample_layers=1, ) - self.upsample_med_res = self._create_project_upsample_block( + self.upsample_med_res = DepthProUpsampleBlock( input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[2], + intermediate_dims=config.med_res_feature_dims, + output_dims=config.med_res_feature_dims, n_upsample_layers=1, ) - self.upsample_low_res = self._create_project_upsample_block( + self.upsample_low_res = 
DepthProUpsampleBlock( input_dims=config.hidden_size, - output_dims=config.patch_encoder_feature_dims[3], + intermediate_dims=config.low_res_feature_dims, + output_dims=config.low_res_feature_dims, n_upsample_layers=1, ) - # upsampling features (6) - self.upsample_image = nn.ConvTranspose2d( - in_channels=config.hidden_size, - out_channels=config.patch_encoder_feature_dims[3], - kernel_size=2, - stride=2, - padding=0, - bias=True, - ) - self.fuse_image_with_low_res = nn.Conv2d( - in_channels=(config.patch_encoder_feature_dims[3] + config.patch_encoder_feature_dims[3]), - out_channels=config.patch_encoder_feature_dims[3], - kernel_size=1, - stride=1, - padding=0, + # upsampling image features - (6) in diagram + self.upsample_image = DepthProUpsampleBlock( + input_dims=config.hidden_size, + intermediate_dims=config.hidden_size, + output_dims=config.image_feature_dims, + n_upsample_layers=1, + use_proj=False, bias=True, ) - def _intermediate0_hook(self, model, input, output): - self.intermediate0_hidden_states = output[0] - - def _intermediate1_hook(self, model, input, output): - self.intermediate1_hidden_states = output[0] - - def _create_project_upsample_block( - self, - input_dims: int, - output_dims: int, - n_upsample_layers: int, - intermediate_dims: Optional[int] = None, - ) -> nn.Module: - - intermediate_dims = intermediate_dims or output_dims - - # Projection block followed by upsampling blocks. - blocks = [ - nn.Conv2d(input_dims, intermediate_dims, kernel_size=1, stride=1, padding=0, bias=False) - ] + [ - nn.ConvTranspose2d( - in_channels=(intermediate_dims if i == 0 else output_dims), - out_channels=output_dims, - kernel_size=2, - stride=2, - padding=0, - bias=False - ) for i in range(n_upsample_layers) - ] - - return nn.Sequential(*blocks) - def _interpolate(self, pixel_values, scale_factor): return nn.functional.interpolate( pixel_values, @@ -771,97 +778,100 @@ def forward( dim=0, ) - # STEP 3: apply patch encoder + # STEP 3: apply patch and image encoder patch_encodings = self.patch_encoder( patches, head_mask=head_mask, output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_hidden_states=True, # required for intermediate features return_dict=True, ) - patch_features = patch_encodings[0] - patch_features = self._reshape_feature( - patch_features, self.out_size, self.out_size + image_encodings = self.image_encoder( + pixel_values=low_res_patches, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, ) - # STEP 4: Get Intermediate Features (features 1 and 2) - - intermediate0_features = self._reshape_feature( - self.intermediate0_hidden_states, - self.out_size, - self.out_size, - ) - intermediate1_features = self._reshape_feature( - self.intermediate1_hidden_states, - self.out_size, - self.out_size, - ) - intermediate0_features = self._merge( - intermediate0_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 - ) - intermediate1_features = self._merge( - intermediate1_features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 - ) + # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram - # STEP 5: Get Patch Encoder Features (features 3-5) + # a. extract hidden_state + hidden_state = patch_encodings.last_hidden_state + # b. 
reshape back to image like + features = self._reshape_feature( + hidden_state, self.out_size, self.out_size + ) high_res_features, med_res_features, low_res_features = torch.split( - patch_features, + features, [len(high_res_patches), len(med_res_patches), len(low_res_patches)], dim=0, ) + # c. merge patches back together high_res_features = self._merge(high_res_features, batch_size=batch_size, padding=3) med_res_features = self._merge(med_res_features, batch_size=batch_size, padding=6) - low_res_features = low_res_features + low_res_features = low_res_features # no merge required with low res image - # STEP 6: Get Image Encoder Features (features 6) + # d. upsample + high_res_features = self.upsample_high_res(high_res_features) + med_res_features = self.upsample_med_res(med_res_features) + low_res_features = self.upsample_low_res(low_res_features) - image_encodings = self.image_encoder( - pixel_values=low_res_patches, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, - ) - image_features = image_encodings[0] - image_features = self._reshape_feature( - image_features, self.out_size, self.out_size - ) + # STEP 5: get intermediate features - (1-2) in diagram - # STEP 7: Upsample All Features (feature 1-6) + intermediate_features = [] + for layer_id in self.patch_encoder_hook_ids: + + # a. extract hidden_state + hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well - # feature (1-2) - intermediate0_features = self.upsample_intermediate0( - intermediate0_features - ) - intermediate1_features = self.upsample_intermediate1( - intermediate1_features + # b. reshape back to image like + features = self._reshape_feature( + hidden_state, + self.out_size, + self.out_size, + ) + + # c. merge patches back together + features = self._merge( + features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + ) + + # d. upsample + features = self.upsample_intermediate[layer_id](features) + + intermediate_features.append(features) + + # STEP 6: get image features - (6) in diagram + + # a. extract hidden_state + hidden_state = image_encodings.last_hidden_state + + # b. reshape back to image like + image_features = self._reshape_feature( + hidden_state, self.out_size, self.out_size ) - # feature (3-5) - high_res_features = self.upsample_high_res(high_res_features) - med_res_features = self.upsample_med_res(med_res_features) - low_res_features = self.upsample_low_res(low_res_features) + # c. merge patches back together + # skipped, no merge required with low res image - # feature (6) + # d. 
upsample image_features = self.upsample_image(image_features) - image_features = self.fuse_image_with_low_res( - torch.cat((low_res_features, image_features), dim=1) - ) + # STEP 7: return these features last_hidden_state = [ - intermediate0_features, - intermediate1_features, + *intermediate_features, high_res_features, med_res_features, - # low_res_features, - image_features, # fused with low_res_features + low_res_features, + image_features, ] - hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_attentions else None - attentions = patch_encodings.attentions + image_encodings.attentions if output_hidden_states else None + hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None + attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None if not return_dict: return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None) @@ -882,7 +892,7 @@ def __init__(self, config: DepthProConfig) -> None: self.encoder = DepthProViT(config) self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) - self.low_res_neck = nn.Sequential( + self.global_neck = nn.Sequential( nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True) ) @@ -897,7 +907,7 @@ def __init__(self, config: DepthProConfig) -> None: def forward( self, pixel_values: torch.Tensor, - low_res_features: torch.Tensor, + global_features: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, output_hidden_states: bool = False, @@ -923,19 +933,19 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) - image_features = encoder_outputs[0] + last_hidden_state = encoder_outputs[0] - image_features = self.encoder_neck(image_features) + last_hidden_state = self.encoder_neck(last_hidden_state) # TODO: add some comments - image_features = image_features[:, 1:] - image_features = image_features.permute(0, 2, 1) + last_hidden_state = last_hidden_state[:, 1:] + last_hidden_state = last_hidden_state.permute(0, 2, 1) - low_res_features = self.low_res_neck(low_res_features) + global_features = self.global_neck(global_features) - image_features = image_features.reshape_as(low_res_features) - image_features = image_features + low_res_features - fov_output = self.head(image_features) + last_hidden_state = last_hidden_state.reshape_as(global_features) + last_hidden_state = last_hidden_state + global_features + fov_output = self.head(last_hidden_state) fov_output = fov_output.reshape(1) if not return_dict: @@ -1040,65 +1050,126 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config - self.hidden_size = config.decoder_hidden_size - self.decoder_feature_dims = [config.decoder_hidden_size] + config.patch_encoder_feature_dims - - self.projections = nn.ModuleList() - self.fusions = nn.ModuleList() - for i, dim in enumerate(self.decoder_feature_dims): - - # Projection - if i != 0: - # conv for hidden_states[1:] - projection = nn.Conv2d( - in_channels=dim, - out_channels=self.hidden_size, + # for STEP 2: fuse low_res and image features + self.fuse_image_with_low_res = nn.Conv2d( + in_channels=config.low_res_feature_dims+config.image_feature_dims, + out_channels=config.global_feature_dims, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + + # for STEP 3: apply decoder block for global features + self.global_proj = 
nn.Conv2d( + in_channels=config.global_feature_dims, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.global_fusion = DepthProFeatureFusionLayer(config) + + # for STEP 4: apply decoder block for med features + self.med_res_proj = nn.Conv2d( + in_channels=config.med_res_feature_dims, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.med_res_fusion = DepthProFeatureFusionLayer(config) + + # for STEP 5: apply decoder block for high features + self.high_res_proj = nn.Conv2d( + in_channels=config.high_res_feature_dims, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.high_res_fusion = DepthProFeatureFusionLayer(config) + + # for STEP 6: apply decoder block for intermediate features + self.intermediate_proj = nn.Sequential() + self.intermediate_fusion = nn.Sequential() + for i, feature_dim in enumerate(config.intermediate_feature_dims): + if i == 0: + # no projection for final intermediate layer + proj = nn.Identity() + fusion = DepthProFeatureFusionLayer(config, use_deconv=False) + else: + proj = nn.Conv2d( + in_channels=feature_dim, + out_channels=config.decoder_hidden_size, kernel_size=3, stride=1, padding=1, bias=False, ) - elif self.hidden_size != dim: - # first hidden_state with dim differnet from hidden_size - projection = nn.Conv2d( - in_channels=dim, - out_channels=self.hidden_size, - kernel_size=1, - bias=False, - ) - else: - # first hidden_state with dim same as hidden_size - projection = nn.Identity() - self.projections.append(projection) + fusion = DepthProFeatureFusionLayer(config) - # Fusion - fusion = DepthProFeatureFusionLayer(config, use_deconv=(i!=0)) - self.fusions.append(fusion) + self.intermediate_proj.append(proj) + self.intermediate_fusion.append(fusion) def forward(self, hidden_states): - if len(hidden_states) != len(self.decoder_feature_dims): - raise ValueError( - f"Got number of hidden_states = {len(hidden_states)}," - f"expected number of hidden_states = {len(self.decoder_feature_dims)}." 
- ) + # STEP 1: extract features - # first extract the low_res_features - last_features = hidden_states[-1] - last_features = self.projections[-1](last_features) - low_res_features = last_features # required later for fov_encoder - last_features = self.fusions[-1](last_features) + intermediate_features = hidden_states[:-4] + # intermediate_features_i.shape: [batch_size, config.intermediate_feature_dims_i, 768, 768], [1, 256, 384, 384] + high_res_features = hidden_states[-4] + # high_res_features.shape: [batch_size, config.high_res_feature_dims, 192, 192] + med_res_features = hidden_states[-3] + # med_res_features.shape: [batch_size, config.med_res_feature_dims, 96, 96] + low_res_features = hidden_states[-2] + # low_res_features.shape: [batch_size, config.low_res_feature_dims, 48, 48] + image_features = hidden_states[-1] + # image_features.shape: [batch_size, config.image_feature_dims, 48, 48] - # now get features through each layer - for i in range(len(hidden_states) - 2, -1, -1): - hidden_state = hidden_states[i] - projection = self.projections[i] - fusion = self.fusions[i] + # STEP 2: fuse low_res and image features - projected = projection(hidden_state) - last_features = fusion(last_features, projected) + global_features = torch.cat((low_res_features, image_features), dim=1) + global_features = self.fuse_image_with_low_res(global_features) + # global_features.shape: [batch_size, config.global_feature_dims, 48, 48] - return last_features, low_res_features + # STEP 3: apply decoder block for global features + + # apply projection: used by fusion now and then fov later + global_projected = self.global_proj(global_features) + # apply fusion: used by next projections and fusions + last_features = self.global_fusion(global_projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 96, 96] + + # STEP 4: apply decoder block for med features + + projected = self.med_res_proj(med_res_features) + last_features = self.med_res_fusion(last_features, projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 192, 192] + + # STEP 5: apply decoder block for high features + + projected = self.high_res_proj(high_res_features) + last_features = self.high_res_fusion(last_features, projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 384, 384] + + # STEP 6: apply decoder block for intermediate features + + for (features, proj_layer, fusion_layer) in zip( + # reversed becuase decoding is applied from last features to first features + intermediate_features[::-1], + self.intermediate_proj[::-1], + self.intermediate_fusion[::-1], + ): + projected = proj_layer(features) + last_features = fusion_layer(last_features, projected) + # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] + # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] + + return last_features, global_projected class DepthProPreTrainedModel(PreTrainedModel): @@ -1233,26 +1304,18 @@ def forward( encodings = self.encoder( pixel_values, head_mask, - output_attentions, - output_hidden_states, - return_dict, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, ) - encodings_last_hidden_state = encodings.last_hidden_state - - for i in range(len(encodings_last_hidden_state)): - ic(encodings_last_hidden_state[i].shape) - - features, low_res_features = self.decoder(encodings_last_hidden_state) - - ic(features.shape) - ic(low_res_features.shape) - # ic(features); exit() + last_hidden_state = 
encodings[0] + last_hidden_state, global_features = self.decoder(last_hidden_state) if self.use_fov: fov_out = self.fov_model( pixel_values=pixel_values, - low_res_features=low_res_features.detach(), + global_features=global_features.detach(), head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -1261,7 +1324,8 @@ def forward( else: fov_out = None - return features, fov_out + # TODO: return all hidden_states + return last_hidden_state, fov_out class DepthProDepthEstimationHead(nn.Module): @@ -1375,18 +1439,16 @@ def forward( outputs = [None] * 4 - hidden_states, fov_out = self.depth_pro( + last_hidden_state, fov_out = self.depth_pro( pixel_values=pixel_values, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - predicted_depth = self.head(hidden_states) - ic(predicted_depth.shape) - ic(fov_out.shape) + predicted_depth = self.head(last_hidden_state) - # ic(predicted_depth); exit() + ic(predicted_depth) ic(fov_out); exit() if not return_dict: From 11ce50c5cf2c87839909da806b1a9dc1665c11f2 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 12 Nov 2024 10:49:46 +0500 Subject: [PATCH 003/151] update model outputs --- .../models/depth_pro/modeling_depth_pro.py | 77 ++++++++++++++----- 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 74669bc4e557..daa2bbbdd64b 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -24,9 +24,10 @@ from torch import nn from dataclasses import dataclass +from ...utils import ModelOutput from ...activations import ACT2FN from ...modeling_outputs import ( - BaseModelOutput, + BaseModelOutput, DepthEstimatorOutput ) from ...utils import ( add_code_sample_docstrings, @@ -1232,6 +1233,18 @@ def _init_weights(self, module): """ +@dataclass +class DepthProModelOutput(BaseModelOutput): + """ + Base class for model's outputs, with potential fov, hidden states and attentions. + + Args: + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + Field of View Scaler. 
+ """ + fov: Optional[torch.FloatTensor] = None + + @add_start_docstrings( "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", DEPTH_PRO_START_DOCSTRING, @@ -1306,14 +1319,14 @@ def forward( head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) - last_hidden_state = encodings[0] + last_hidden_state = encodings.last_hidden_state last_hidden_state, global_features = self.decoder(last_hidden_state) if self.use_fov: - fov_out = self.fov_model( + fov_encodings = self.fov_model( pixel_values=pixel_values, global_features=global_features.detach(), head_mask=head_mask, @@ -1321,11 +1334,24 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) + fov = fov_encodings.last_hidden_state else: - fov_out = None + fov = None + + attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None + hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + + if not return_dict: + outputs = (last_hidden_state, fov, hidden_states, attentions) + outputs = (i for i in outputs if i is not None) + return outputs - # TODO: return all hidden_states - return last_hidden_state, fov_out + return DepthProModelOutput( + last_hidden_state=last_hidden_state, + fov=fov, + hidden_states=hidden_states, + attentions=attentions, + ) class DepthProDepthEstimationHead(nn.Module): @@ -1360,6 +1386,18 @@ def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: return predicted_depth +@dataclass +class DepthProDepthEstimatorOutput(DepthEstimatorOutput): + """ + Base class for outputs of DepthProDepthEstimator. + + Args: + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + Field of View Scaler. + """ + fov: Optional[torch.FloatTensor] = None + + @add_start_docstrings( """ DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). 
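
A quick illustrative sketch (not from the patch itself): the depth estimation head referenced in the docstring above can be exercised in isolation. Everything below is an assumption for illustration, namely that the in-progress `depth_pro` modules import cleanly and that the decoder hands the head a `(batch_size, decoder_hidden_size, 768, 768)` feature map, which is the shape the decoder comments earlier in this series state.

```python
# Sketch only: runs DepthProDepthEstimationHead on a random decoder-like feature map.
# The import paths and the 768x768 input size are assumptions based on this PR's code.
import torch

from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig
from transformers.models.depth_pro.modeling_depth_pro import DepthProDepthEstimationHead

config = DepthProConfig()                      # decoder_hidden_size defaults to 256
head = DepthProDepthEstimationHead(config)

features = torch.randn(1, config.decoder_hidden_size, 768, 768)
depth = head(features)                         # 3x3 conv, 2x transposed-conv upsample, then down to 1 channel
print(depth.shape)                             # torch.Size([1, 1536, 1536]): one depth map per image
```
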
@@ -1436,31 +1474,28 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + # use_fov = use_fov if use_fov is not None else self.config.use_fov - outputs = [None] * 4 - - last_hidden_state, fov_out = self.depth_pro( + depth_pro_outputs = self.depth_pro( pixel_values=pixel_values, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) + last_hidden_state = depth_pro_outputs[0] predicted_depth = self.head(last_hidden_state) - ic(predicted_depth) - ic(fov_out); exit() - if not return_dict: - if output_hidden_states: - output = (predicted_depth,) + outputs[1:] + if loss is None: + return (predicted_depth,) + depth_pro_outputs[1:] else: - output = (predicted_depth,) + outputs[2:] - return ((loss,) + output) if loss is not None else output + return (loss, predicted_depth) + depth_pro_outputs[1:] - return DepthEstimatorOutput( + return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, - # hidden_states=outputs.hidden_states, - # attentions=outputs.attentions, + fov=depth_pro_outputs.fov, + hidden_states=depth_pro_outputs.hidden_states, + attentions=depth_pro_outputs.attentions, ) From 27e9593ada48c5c17a3a96e67bff534e022359ad Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 10:23:03 +0500 Subject: [PATCH 004/151] update init param to include use_fov_model --- .../models/depth_pro/modeling_depth_pro.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index daa2bbbdd64b..f8b69bfec86e 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1239,7 +1239,7 @@ class DepthProModelOutput(BaseModelOutput): Base class for model's outputs, with potential fov, hidden states and attentions. Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. 
""" fov: Optional[torch.FloatTensor] = None @@ -1250,17 +1250,17 @@ class DepthProModelOutput(BaseModelOutput): DEPTH_PRO_START_DOCSTRING, ) class DepthProModel(DepthProPreTrainedModel): - def __init__(self, config): + def __init__(self, config, use_fov_model=None): super().__init__(config) self.config = config - self.use_fov = config.use_fov + self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model # dinov2 (vit) like encoder self.encoder = DepthProEncoder(config) # dpt (vit) like decoder self.decoder = DepthProDecoder(config) # dinov2 (vit) like encoder - self.fov_model = DepthProFOVModel(config) if self.use_fov else None + self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None # Initialize weights and apply final processing self.post_init() @@ -1325,7 +1325,7 @@ def forward( last_hidden_state = encodings.last_hidden_state last_hidden_state, global_features = self.decoder(last_hidden_state) - if self.use_fov: + if self.use_fov_model: fov_encodings = self.fov_model( pixel_values=pixel_values, global_features=global_features.detach(), @@ -1392,7 +1392,7 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): Base class for outputs of DepthProDepthEstimator. Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov` is provided): + fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. """ fov: Optional[torch.FloatTensor] = None @@ -1405,10 +1405,11 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): DEPTH_PRO_START_DOCSTRING, ) class DepthProForDepthEstimation(DepthProPreTrainedModel): - def __init__(self, config): + def __init__(self, config, use_fov_model=None): super().__init__(config) + self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model - self.depth_pro = DepthProModel(config) + self.depth_pro = DepthProModel(config, use_fov_model=self.use_fov_model) self.head = DepthProDepthEstimationHead(config) # Initialize weights and apply final processing @@ -1474,7 +1475,6 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - # use_fov = use_fov if use_fov is not None else self.config.use_fov depth_pro_outputs = self.depth_pro( pixel_values=pixel_values, From e74a7f505f91a24117e7838e367b72a50ff9e8f1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 10:24:21 +0500 Subject: [PATCH 005/151] update param name in config --- src/transformers/models/depth_pro/configuration_depth_pro.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 7e66e679c67f..a4037c99ee0f 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -142,7 +142,7 @@ def __init__( global_feature_dims = 1024, use_batch_norm_in_decoder=False, - use_fov=False, + use_fov_model=False, **kwargs, ): super().__init__(**kwargs) @@ -173,7 +173,7 @@ def __init__( self.patch_encoder_hook_ids = patch_encoder_hook_ids self.patch_encoder_feature_dims = patch_encoder_feature_dims self.use_batch_norm_in_decoder = use_batch_norm_in_decoder - self.use_fov = use_fov + self.use_fov_model = use_fov_model self.intermediate_feature_dims = 
intermediate_feature_dims self.intermediate_upsample_layers = intermediate_upsample_layers From 8c2460b0655dd3ef698b765eb64c79cc785c7d10 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 10:51:56 +0500 Subject: [PATCH 006/151] fix hidden_states and attentions outputs for fov --- src/transformers/models/depth_pro/modeling_depth_pro.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f8b69bfec86e..620133771c06 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1332,14 +1332,15 @@ def forward( head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, ) fov = fov_encodings.last_hidden_state + attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None + hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None else: fov = None - - attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + attentions = encodings.attentions + hidden_states = encodings.hidden_states if not return_dict: outputs = (last_hidden_state, fov, hidden_states, attentions) From 55f6ed3439cef2a731b8b78cba3b6142e3125447 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 11:20:56 +0500 Subject: [PATCH 007/151] sort config --- .../models/depth_pro/configuration_depth_pro.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index a4037c99ee0f..16ff55e9cb6c 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -129,10 +129,7 @@ def __init__( out_indices=None, apply_layernorm=True, reshape_hidden_states=True, - patch_encoder_feature_dims = [256, 512, 1024, 1024], - patch_encoder_hook_ids = [5, 11], - # patch_encoder_hook_ids = [5, 11, 17, 23], intermediate_feature_dims = [256, 256], intermediate_upsample_layers = [3, 2], high_res_feature_dims = 512, @@ -140,7 +137,6 @@ def __init__( low_res_feature_dims = 1024, image_feature_dims = 1024, global_feature_dims = 1024, - use_batch_norm_in_decoder=False, use_fov_model=False, **kwargs, @@ -171,10 +167,8 @@ def __init__( self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states self.patch_encoder_hook_ids = patch_encoder_hook_ids - self.patch_encoder_feature_dims = patch_encoder_feature_dims self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model - self.intermediate_feature_dims = intermediate_feature_dims self.intermediate_upsample_layers = intermediate_upsample_layers self.high_res_feature_dims = high_res_feature_dims From b25dffb5d7f0aef86bb7c2dac990c24b28dafb5a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 11:21:13 +0500 Subject: [PATCH 008/151] complete minor todos --- .../models/depth_pro/modeling_depth_pro.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 
620133771c06..956fe7afb7f7 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -938,8 +938,7 @@ def forward( last_hidden_state = self.encoder_neck(last_hidden_state) - # TODO: add some comments - last_hidden_state = last_hidden_state[:, 1:] + last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token last_hidden_state = last_hidden_state.permute(0, 2, 1) global_features = self.global_neck(global_features) @@ -1357,10 +1356,10 @@ def forward( class DepthProDepthEstimationHead(nn.Module): """ - # TODO - Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples - the predictions to the input resolution after the first convolutional layer (details can be found in the paper's - supplementary material). + The DepthProDepthEstimationHead module serves as the output head for depth estimation tasks. + This module comprises a sequence of convolutional and transposed convolutional layers + that process the feature map from the decoder to produce a single-channel depth map. + Key operations include dimensionality reduction and upsampling to match the input resolution. """ def __init__(self, config): From c225deb0d126a8420ccb5e381fa2e120abedabf0 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 13:22:15 +0500 Subject: [PATCH 009/151] update patching --- .../models/depth_pro/modeling_depth_pro.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 956fe7afb7f7..59b6d46e30ca 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -685,23 +685,25 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): - patch_size = 384 # TODO: this should be infered - patch_stride = int(patch_size * (1 - overlap_ratio)) + B, C, H, W = pixel_values.shape - image_size = pixel_values.shape[-1] - steps = int(math.ceil((image_size - patch_size) / patch_stride)) + 1 + patch_size = 384 # TODO: this should be inferred + stride = int(patch_size * (1 - overlap_ratio)) - x_patch_list = [] - for j in range(steps): - j0 = j * patch_stride - j1 = j0 + patch_size + if pixel_values.dim() != 4: + raise ValueError("Input tensor must have shape (B, C, H, W).") - for i in range(steps): - i0 = i * patch_stride - i1 = i0 + patch_size - x_patch_list.append(pixel_values[..., j0:j1, i0:i1]) + # pixel_values.shape (B, C, H, W) + patches = torch.nn.functional.unfold( + pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) + ) + # patches.shape (B, -1, num_patches) + patches = patches.permute(2, 0, 1) + # patches.shape (num_patches, B, -1) + patches = patches.reshape(-1, C, patch_size, patch_size) + # patches.shape (B * num_patches, C, patch_size, patch_size) - return torch.cat(x_patch_list, dim=0) + return patches def _reshape_feature( self, hidden_states: torch.Tensor, width, height, cls_token_offset=1 @@ -760,7 +762,7 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - batch_size = pixel_values.shape[0] + B, C, H, W = pixel_values.shape # STEP 1: create 3-level image @@ -812,8 +814,8 @@ def forward( ) # c. 
merge patches back together - high_res_features = self._merge(high_res_features, batch_size=batch_size, padding=3) - med_res_features = self._merge(med_res_features, batch_size=batch_size, padding=6) + high_res_features = self._merge(high_res_features, batch_size=B, padding=3) + med_res_features = self._merge(med_res_features, batch_size=B, padding=6) low_res_features = low_res_features # no merge required with low res image # d. upsample @@ -838,7 +840,7 @@ def forward( # c. merge patches back together features = self._merge( - features[: batch_size * 5 * 5], batch_size=batch_size, padding=3 + features[: B * 5 * 5], batch_size=B, padding=3 ) # d. upsample From 176932dc6aba7bfaf541bee756fc493f541434dd Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 16:35:43 +0500 Subject: [PATCH 010/151] update config for encoder --- .../depth_pro/configuration_depth_pro.py | 14 ++- .../models/depth_pro/modeling_depth_pro.py | 108 ++++++++++-------- 2 files changed, 71 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 16ff55e9cb6c..cdf3cf4d8d70 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -119,7 +119,7 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-6, image_size=384, - patch_size=16, # changed + patch_size=16, # TODO remove this num_channels=3, qkv_bias=True, layerscale_value=1.0, @@ -139,6 +139,13 @@ def __init__( global_feature_dims = 1024, use_batch_norm_in_decoder=False, use_fov_model=False, + + # aux_image_size=1536, + # aux_patch_size=384, + aux_image_size=1536 // 2, + aux_patch_size=384 // 2, + aux_num_channels=3, + patch_embeddings_size=16, **kwargs, ): super().__init__(**kwargs) @@ -176,3 +183,8 @@ def __init__( self.low_res_feature_dims = low_res_feature_dims self.image_feature_dims = image_feature_dims self.global_feature_dims = global_feature_dims + + self.aux_image_size = aux_image_size + self.aux_patch_size = aux_patch_size + self.aux_num_channels = aux_num_channels + self.patch_embeddings_size = patch_embeddings_size diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 59b6d46e30ca..3d3d356cc0ee 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -55,22 +55,22 @@ class DepthProViTPatchEmbeddings(nn.Module): def __init__(self, config): super().__init__() - image_size, patch_size = config.image_size, config.patch_size - num_channels, hidden_size = config.num_channels, config.hidden_size - image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) - patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels - self.num_patches = num_patches - - self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + self.config = config + self.in_channels = config.aux_num_channels + self.out_channels = config.hidden_size + self.patch_embeddings_size = config.patch_embeddings_size + + self.projection = nn.Conv2d( + self.in_channels, + self.out_channels, + kernel_size=(self.patch_embeddings_size, 
self.patch_embeddings_size), + stride=(self.patch_embeddings_size, self.patch_embeddings_size), + ) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: num_channels = pixel_values.shape[1] - if num_channels != self.num_channels: + if num_channels != self.config.aux_num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." f" Expected {self.num_channels} but got {num_channels}." @@ -89,10 +89,12 @@ class DepthProViTEmbeddings(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() + self.config = config + self.seq_len = (config.aux_patch_size // config.patch_embeddings_size) ** 2 + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) self.patch_embeddings = DepthProViTPatchEmbeddings(config) - num_patches = self.patch_embeddings.num_patches - self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.position_embeddings = nn.Parameter(torch.randn(1, self.seq_len + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.patch_size = config.patch_size self.config = config @@ -107,11 +109,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_patches = embeddings.shape[1] - 1 num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes - if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: return self.position_embeddings class_pos_embed = self.position_embeddings[:, :1] @@ -119,8 +120,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: dim = embeddings.shape[-1] - new_height = height // self.patch_size - new_width = width // self.patch_size + new_height = height // self.patch_size # TODO: check this + new_width = width // self.patch_size # TODO: check this sqrt_num_positions = torch_int(num_positions**0.5) patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) @@ -621,8 +622,9 @@ def __init__(self, config: DepthProConfig) -> None: self.patch_encoder_hook_ids = config.patch_encoder_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims self.intermediate_upsample_layers = config.intermediate_upsample_layers - - self.out_size = 24 # TODO: image_size // patch_size + + self.out_size = config.aux_patch_size // config.patch_embeddings_size + self.seq_len = self.out_size ** 2 # patch encoder self.patch_encoder = DepthProViT(config) @@ -685,23 +687,18 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): - B, C, H, W = pixel_values.shape - - patch_size = 384 # TODO: this should be inferred + patch_size = self.config.aux_patch_size stride = int(patch_size * (1 - overlap_ratio)) - if pixel_values.dim() != 4: - raise ValueError("Input tensor must have shape (B, C, H, W).") - - # pixel_values.shape (B, C, H, W) + # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) patches = torch.nn.functional.unfold( pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) ) # patches.shape (B, -1, num_patches) patches = 
patches.permute(2, 0, 1) # patches.shape (num_patches, B, -1) - patches = patches.reshape(-1, C, patch_size, patch_size) - # patches.shape (B * num_patches, C, patch_size, patch_size) + patches = patches.reshape(-1, self.config.aux_num_channels, patch_size, patch_size) + # patches.shape (B * num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) return patches @@ -762,24 +759,33 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if pixel_values.dim() != 4: + raise ValueError("Input tensor must have shape (B, C, H, W).") + B, C, H, W = pixel_values.shape + # TODO validate: H = W = aux_image_size + # TODO validate: C = aux_num_channels + # TODO validate: aux_image_size = aux_patch_size * 4 + + # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + # STEP 1: create 3-level image - high_res = pixel_values - med_res = self._interpolate(pixel_values, 0.5) - low_res = self._interpolate(pixel_values, 0.25) + high_res = pixel_values # (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + med_res = self._interpolate(pixel_values, 0.5) # (B, config.aux_num_channels, config.aux_image_size//2, config.aux_image_size//2) + low_res = self._interpolate(pixel_values, 0.25) # (B, config.aux_num_channels, config.aux_image_size//4, config.aux_image_size//4) # STEP 2: create patches - high_res_patches = self._patch(high_res, 0.25) - med_res_patches = self._patch(med_res, 0.5) - low_res_patches = low_res + high_res_patches = self._patch(high_res, 0.25) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + med_res_patches = self._patch(med_res, 0.5) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + low_res_patches = low_res # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) patches = torch.cat( (high_res_patches, med_res_patches, low_res_patches), dim=0, - ) + ) # (num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) # STEP 3: apply patch and image encoder @@ -801,42 +807,43 @@ def forward( # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram # a. extract hidden_state - hidden_state = patch_encodings.last_hidden_state + hidden_state = patch_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) + ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) high_res_features, med_res_features, low_res_features = torch.split( features, [len(high_res_patches), len(med_res_patches), len(low_res_patches)], dim=0, - ) + ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) # c. merge patches back together - high_res_features = self._merge(high_res_features, batch_size=B, padding=3) - med_res_features = self._merge(med_res_features, batch_size=B, padding=6) - low_res_features = low_res_features # no merge required with low res image + high_res_features = self._merge(high_res_features, batch_size=B, padding=3) # (B, config.hidden_size, ~, ~) + med_res_features = self._merge(med_res_features, batch_size=B, padding=6) # (B, config.hidden_size, ~, ~) + low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, ~, ~) # d. 
upsample - high_res_features = self.upsample_high_res(high_res_features) - med_res_features = self.upsample_med_res(med_res_features) - low_res_features = self.upsample_low_res(low_res_features) + high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, ~, ~) + med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, ~, ~) + low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, ~, ~) # STEP 5: get intermediate features - (1-2) in diagram intermediate_features = [] for layer_id in self.patch_encoder_hook_ids: - + # a. extract hidden_state hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well + # (num_patches, self.seq_len+1, config.hidden_size) # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size, - ) + ) # (num_patches, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together features = self._merge( @@ -845,24 +852,25 @@ def forward( # d. upsample features = self.upsample_intermediate[layer_id](features) + # (B, config.intermediate_feature_dims[layer_id], ~, ~) intermediate_features.append(features) # STEP 6: get image features - (6) in diagram # a. extract hidden_state - hidden_state = image_encodings.last_hidden_state + hidden_state = image_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) + ) # (num_patches, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together # skipped, no merge required with low res image # d. 
upsample - image_features = self.upsample_image(image_features) + image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, ~, ~) # STEP 7: return these features last_hidden_state = [ From dcec5228b21352f6638c27c91f1d4056323eba95 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 16:46:17 +0500 Subject: [PATCH 011/151] fix config --- .../depth_pro/configuration_depth_pro.py | 20 +++----- .../models/depth_pro/modeling_depth_pro.py | 48 +++++++++---------- 2 files changed, 29 insertions(+), 39 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index cdf3cf4d8d70..fc12b37b19d0 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -118,9 +118,12 @@ def __init__( attention_probs_dropout_prob=0.0, initializer_range=0.02, layer_norm_eps=1e-6, - image_size=384, - patch_size=16, # TODO remove this + # image_size=1536, + # patch_size=384, + image_size=1536 // 2, + patch_size=384 // 2, num_channels=3, + patch_embeddings_size=16, qkv_bias=True, layerscale_value=1.0, drop_path_rate=0.0, @@ -139,13 +142,6 @@ def __init__( global_feature_dims = 1024, use_batch_norm_in_decoder=False, use_fov_model=False, - - # aux_image_size=1536, - # aux_patch_size=384, - aux_image_size=1536 // 2, - aux_patch_size=384 // 2, - aux_num_channels=3, - patch_embeddings_size=16, **kwargs, ): super().__init__(**kwargs) @@ -163,6 +159,7 @@ def __init__( self.image_size = image_size self.patch_size = patch_size self.num_channels = num_channels + self.patch_embeddings_size = patch_embeddings_size self.qkv_bias = qkv_bias self.layerscale_value = layerscale_value self.drop_path_rate = drop_path_rate @@ -183,8 +180,3 @@ def __init__( self.low_res_feature_dims = low_res_feature_dims self.image_feature_dims = image_feature_dims self.global_feature_dims = global_feature_dims - - self.aux_image_size = aux_image_size - self.aux_patch_size = aux_patch_size - self.aux_num_channels = aux_num_channels - self.patch_embeddings_size = patch_embeddings_size diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 3d3d356cc0ee..d56391313979 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -57,7 +57,7 @@ def __init__(self, config): super().__init__() self.config = config - self.in_channels = config.aux_num_channels + self.in_channels = config.num_channels self.out_channels = config.hidden_size self.patch_embeddings_size = config.patch_embeddings_size @@ -70,7 +70,7 @@ def __init__(self, config): def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: num_channels = pixel_values.shape[1] - if num_channels != self.config.aux_num_channels: + if num_channels != self.config.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." f" Expected {self.num_channels} but got {num_channels}." 
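
A small illustrative calculation (not from the patch itself) to make the size bookkeeping in these config changes concrete. It assumes the full-resolution defaults that are kept commented out here and restored a couple of commits later (image_size=1536, patch_size=384, patch_embeddings_size=16) together with the overlap ratios used in `DepthProEncoder.forward`; the resulting 25 + 9 + 1 crops per image are also where the hard-coded `batch_size * 5 * 5` slice in the encoder comes from.

```python
# Sketch only: derives the token and patch counts implied by the config values,
# nothing here is read from the library.
image_size = 1536            # intended default (halved in this commit, restored later)
patch_size = 384             # side of each crop fed to the ViT patch encoder
patch_embeddings_size = 16   # pixels per ViT token

out_size = patch_size // patch_embeddings_size   # 24 tokens per side after patch embedding
seq_len = out_size ** 2                          # 576 patch tokens (+1 cls token)

def crops_per_image(side, overlap_ratio):
    """Number of overlapping patch_size crops that F.unfold produces per image."""
    stride = int(patch_size * (1 - overlap_ratio))
    per_dim = (side - patch_size) // stride + 1
    return per_dim ** 2

high_res = crops_per_image(image_size, 0.25)      # 5 x 5 = 25 crops of the full-res image
med_res = crops_per_image(image_size // 2, 0.5)   # 3 x 3 = 9 crops of the half-res image
low_res = 1                                       # the quarter-res image is already patch-sized

print(out_size, seq_len, high_res, med_res, low_res)   # 24 576 25 9 1
```
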
@@ -90,14 +90,12 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config - self.seq_len = (config.aux_patch_size // config.patch_embeddings_size) ** 2 + self.seq_len = (config.patch_size // config.patch_embeddings_size) ** 2 self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) self.patch_embeddings = DepthProViTPatchEmbeddings(config) self.position_embeddings = nn.Parameter(torch.randn(1, self.seq_len + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.patch_size = config.patch_size - self.config = config def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ @@ -120,8 +118,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: dim = embeddings.shape[-1] - new_height = height // self.patch_size # TODO: check this - new_width = width // self.patch_size # TODO: check this + new_height = height // self.config.patch_embeddings_size + new_width = width // self.config.patch_embeddings_size sqrt_num_positions = torch_int(num_positions**0.5) patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) @@ -623,7 +621,7 @@ def __init__(self, config: DepthProConfig) -> None: self.intermediate_feature_dims = config.intermediate_feature_dims self.intermediate_upsample_layers = config.intermediate_upsample_layers - self.out_size = config.aux_patch_size // config.patch_embeddings_size + self.out_size = config.patch_size // config.patch_embeddings_size self.seq_len = self.out_size ** 2 # patch encoder @@ -687,18 +685,18 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): - patch_size = self.config.aux_patch_size + patch_size = self.config.patch_size stride = int(patch_size * (1 - overlap_ratio)) - # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) patches = torch.nn.functional.unfold( pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) ) # patches.shape (B, -1, num_patches) patches = patches.permute(2, 0, 1) # patches.shape (num_patches, B, -1) - patches = patches.reshape(-1, self.config.aux_num_channels, patch_size, patch_size) - # patches.shape (B * num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + patches = patches.reshape(-1, self.config.num_channels, patch_size, patch_size) + # patches.shape (B * num_patches, config.num_channels, config.patch_size, config.patch_size) return patches @@ -764,28 +762,28 @@ def forward( B, C, H, W = pixel_values.shape - # TODO validate: H = W = aux_image_size - # TODO validate: C = aux_num_channels - # TODO validate: aux_image_size = aux_patch_size * 4 + # TODO validate: H = W = image_size + # TODO validate: C = num_channels + # TODO validate: image_size = patch_size * 4 - # pixel_values.shape (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) + # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) # STEP 1: create 3-level image - high_res = pixel_values # (B, config.aux_num_channels, config.aux_image_size, config.aux_image_size) - med_res = self._interpolate(pixel_values, 0.5) # (B, config.aux_num_channels, config.aux_image_size//2, config.aux_image_size//2) - low_res = self._interpolate(pixel_values, 0.25) # (B, config.aux_num_channels, config.aux_image_size//4, 
config.aux_image_size//4) + high_res = pixel_values # (B, config.num_channels, config.image_size, config.image_size) + med_res = self._interpolate(pixel_values, 0.5) # (B, config.num_channels, config.image_size//2, config.image_size//2) + low_res = self._interpolate(pixel_values, 0.25) # (B, config.num_channels, config.image_size//4, config.image_size//4) # STEP 2: create patches - high_res_patches = self._patch(high_res, 0.25) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) - med_res_patches = self._patch(med_res, 0.5) # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) - low_res_patches = low_res # (-1, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + high_res_patches = self._patch(high_res, 0.25) # (-1, config.num_channels, config.patch_size, config.patch_size) + med_res_patches = self._patch(med_res, 0.5) # (-1, config.num_channels, config.patch_size, config.patch_size) + low_res_patches = low_res # (-1, config.num_channels, config.patch_size, config.patch_size) patches = torch.cat( (high_res_patches, med_res_patches, low_res_patches), dim=0, - ) # (num_patches, config.aux_num_channels, config.aux_patch_size, config.aux_patch_size) + ) # (num_patches, config.num_channels, config.patch_size, config.patch_size) # STEP 3: apply patch and image encoder @@ -812,12 +810,12 @@ def forward( # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) + ) # (num_patches, config.num_channels, self.out_size, self.out_size) high_res_features, med_res_features, low_res_features = torch.split( features, [len(high_res_patches), len(med_res_patches), len(low_res_patches)], dim=0, - ) # (num_patches, config.aux_num_channels, self.out_size, self.out_size) + ) # (num_patches, config.num_channels, self.out_size, self.out_size) # c. 
merge patches back together high_res_features = self._merge(high_res_features, batch_size=B, padding=3) # (B, config.hidden_size, ~, ~) From 0384d2f189062259b3b99a3d692593e28902ec0b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 16 Nov 2024 19:37:00 +0500 Subject: [PATCH 012/151] use correct defaults in config --- .../models/depth_pro/configuration_depth_pro.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index fc12b37b19d0..aff3eb3e2941 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -118,10 +118,8 @@ def __init__( attention_probs_dropout_prob=0.0, initializer_range=0.02, layer_norm_eps=1e-6, - # image_size=1536, - # patch_size=384, - image_size=1536 // 2, - patch_size=384 // 2, + image_size=1536, + patch_size=384, num_channels=3, patch_embeddings_size=16, qkv_bias=True, From 85e4f868b65fa5b208883cb973824ca6e2557db8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sun, 17 Nov 2024 23:47:50 +0500 Subject: [PATCH 013/151] update merge for compatibility with different image size --- .../depth_pro/configuration_depth_pro.py | 6 +- .../models/depth_pro/modeling_depth_pro.py | 135 +++++++++++------- 2 files changed, 88 insertions(+), 53 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index aff3eb3e2941..d9f973639ad0 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -108,9 +108,9 @@ class DepthProConfig(PretrainedConfig): def __init__( self, - hidden_size=1024, # changed + hidden_size=1024, decoder_hidden_size=256, - num_hidden_layers=24, # changed + num_hidden_layers=24, num_attention_heads=16, mlp_ratio=4, hidden_act="gelu", @@ -132,7 +132,6 @@ def __init__( reshape_hidden_states=True, patch_encoder_hook_ids = [5, 11], intermediate_feature_dims = [256, 256], - intermediate_upsample_layers = [3, 2], high_res_feature_dims = 512, med_res_feature_dims = 1024, low_res_feature_dims = 1024, @@ -172,7 +171,6 @@ def __init__( self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model self.intermediate_feature_dims = intermediate_feature_dims - self.intermediate_upsample_layers = intermediate_upsample_layers self.high_res_feature_dims = high_res_feature_dims self.med_res_feature_dims = med_res_feature_dims self.low_res_feature_dims = low_res_feature_dims diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index d56391313979..316afe444fbb 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -619,7 +619,6 @@ def __init__(self, config: DepthProConfig) -> None: self.decoder_hidden_size = config.decoder_hidden_size self.patch_encoder_hook_ids = config.patch_encoder_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims - self.intermediate_upsample_layers = config.intermediate_upsample_layers self.out_size = config.patch_size // config.patch_embeddings_size self.seq_len = self.out_size ** 2 @@ -632,17 +631,15 @@ def __init__(self, config: DepthProConfig) -> None: # upsampling intermediate features - (1-2) in diagram self.upsample_intermediate = nn.ModuleList() - for 
i, (feature_dims, upsample_layers) in enumerate(zip( - self.intermediate_feature_dims, - self.intermediate_upsample_layers, - )): + for i, feature_dims in enumerate(self.intermediate_feature_dims): intermediate_dims = self.decoder_hidden_size if i == 0 else feature_dims upsample_block = DepthProUpsampleBlock( input_dims=config.hidden_size, intermediate_dims=intermediate_dims, output_dims=feature_dims, - n_upsample_layers=upsample_layers, + n_upsample_layers=1+len(self.intermediate_feature_dims)-i, ) + self.upsample_intermediate.append(upsample_block) # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram @@ -714,34 +711,46 @@ def _reshape_feature( hidden_states = hidden_states.reshape(b, height, width, c).permute(0, 3, 1, 2) return hidden_states - def _merge(self, x: torch.Tensor, batch_size: int, padding: int = 3) -> torch.Tensor: + def _merge(self, x: torch.Tensor, batch_size: int, merge_out_size: int) -> torch.Tensor: """Merge the patched input into a image with sliding window.""" - steps = int(math.sqrt(x.shape[0] // batch_size)) - - idx = 0 - - output_list = [] - for j in range(steps): - output_row_list = [] - for i in range(steps): - output = x[batch_size * idx : batch_size * (idx + 1)] + # x.shape (num_patches, config.num_channels, self.out_size, self.out_size) + box_size = int(math.sqrt(x.shape[0] // batch_size)) - if j != 0: - output = output[..., padding:, :] - if i != 0: - output = output[..., :, padding:] - if j != steps - 1: - output = output[..., :-padding, :] - if i != steps - 1: - output = output[..., :, :-padding] - - output_row_list.append(output) - idx += 1 - - output_row = torch.cat(output_row_list, dim=-1) - output_list.append(output_row) - output = torch.cat(output_list, dim=-2) - return output + """ + merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) + padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) + """ + padding = ( box_size * self.out_size - merge_out_size ) // ( 2 * box_size - 2 ) + + i = 0 + boxes = [] + for h in range(box_size): + boxes_in_row = [] + for w in range(box_size): + box = x[batch_size * i : batch_size * (i + 1)] + + if h != 0: + # remove pad from height if box is not at top border + box = box[..., padding:, :] + if w != 0: + # remove pad from width if box is not at left border + box = box[..., :, padding:] + if h != box_size - 1: + # remove pad from height if box is not at bottom border + box = box[..., :box.shape[-2]-padding, :] + if w != box_size - 1: + # remove pad from width if box is not at right border + box = box[..., :, :box.shape[-1]-padding] + + boxes_in_row.append(box) + i += 1 + + boxes_in_row = torch.cat(boxes_in_row, dim=-1) + boxes.append(boxes_in_row) + + boxes = torch.cat(boxes, dim=-2) + boxes = boxes[..., :merge_out_size, :merge_out_size] + return boxes def forward( self, @@ -818,19 +827,19 @@ def forward( ) # (num_patches, config.num_channels, self.out_size, self.out_size) # c. 
merge patches back together - high_res_features = self._merge(high_res_features, batch_size=B, padding=3) # (B, config.hidden_size, ~, ~) - med_res_features = self._merge(med_res_features, batch_size=B, padding=6) # (B, config.hidden_size, ~, ~) - low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, ~, ~) + high_res_features = self._merge(high_res_features, batch_size=B, merge_out_size=self.out_size*4) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) + med_res_features = self._merge(med_res_features, batch_size=B, merge_out_size=self.out_size*2) # (B, config.hidden_size, self.out_size*2**1, self.out_size*2**1) + low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, self.out_size*2**0, self.out_size*2**0) # d. upsample - high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, ~, ~) - med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, ~, ~) - low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, ~, ~) + high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, self.out_size*2**3, self.out_size*2**3) + med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, self.out_size*2**2, self.out_size*2**2) + low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, self.out_size*2**1, self.out_size*2**1) # STEP 5: get intermediate features - (1-2) in diagram intermediate_features = [] - for layer_id in self.patch_encoder_hook_ids: + for i, layer_id in enumerate(self.patch_encoder_hook_ids): # a. extract hidden_state hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well @@ -845,12 +854,12 @@ def forward( # c. merge patches back together features = self._merge( - features[: B * 5 * 5], batch_size=B, padding=3 - ) + features[: B * 5 * 5], batch_size=B, merge_out_size=self.out_size*4, + ) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) # d. upsample features = self.upsample_intermediate[layer_id](features) - # (B, config.intermediate_feature_dims[layer_id], ~, ~) + # (B, config.intermediate_feature_dims[i], self.out_size*2**(3+total-i), self.out_size*2**(3+total-i)) intermediate_features.append(features) @@ -868,16 +877,25 @@ def forward( # skipped, no merge required with low res image # d. 
upsample - image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, ~, ~) + image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) # STEP 7: return these features last_hidden_state = [ - *intermediate_features, - high_res_features, - med_res_features, - low_res_features, - image_features, + *intermediate_features, # (B, config.image_feature_dims, self.out_size*2**3+total-i, self.out_size*2**3+total-i) + high_res_features, # (B, config.image_feature_dims, self.out_size*2**3, self.out_size*2**3) + med_res_features, # (B, config.image_feature_dims, self.out_size*2**2, self.out_size*2**2) + low_res_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) + image_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) ] + # for i in last_hidden_state: + # ic(i.shape) + # exit() + + # 768, 384, 192, 96, 48, 48 - image_size=1536 + # 384, 192, 96, 48, 24, 24 - image_size=768 (ideal) + # 288, 144, 72, 24, 24, 24 - image_size=768 (practical) + # 1536, 768, 384, 192, 96, 96 - image_size=3072 (ideal) + # 1728, 864, 432, 240, 96, 96 - image_size=3072 (practical) hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None @@ -951,6 +969,11 @@ def forward( global_features = self.global_neck(global_features) + ic(last_hidden_state.shape) + ic(global_features.shape) + + # exit() + last_hidden_state = last_hidden_state.reshape_as(global_features) last_hidden_state = last_hidden_state + global_features fov_output = self.head(last_hidden_state) @@ -1107,7 +1130,15 @@ def __init__(self, config: DepthProConfig) -> None: for i, feature_dim in enumerate(config.intermediate_feature_dims): if i == 0: # no projection for final intermediate layer - proj = nn.Identity() + if feature_dim == config.decoder_hidden_size: + proj = nn.Identity() + else: + proj = nn.Conv2d( + in_channels=feature_dim, + out_channels=config.decoder_hidden_size, + kernel_size=1, + bias=False, + ) fusion = DepthProFeatureFusionLayer(config, use_deconv=False) else: proj = nn.Conv2d( @@ -1124,6 +1155,10 @@ def __init__(self, config: DepthProConfig) -> None: self.intermediate_fusion.append(fusion) def forward(self, hidden_states): + ic("Start of Decoder") + + for i in hidden_states: + ic(i.shape) # STEP 1: extract features @@ -1492,7 +1527,9 @@ def forward( return_dict=True, ) last_hidden_state = depth_pro_outputs[0] + ic(last_hidden_state.shape) predicted_depth = self.head(last_hidden_state) + ic(predicted_depth.shape) if not return_dict: if loss is None: From 00e4aa3b7bb04324cd08f2f87a2a34f4033fccca Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 21 Nov 2024 11:04:58 +0500 Subject: [PATCH 014/151] restructure encoder for custom configuration --- .../depth_pro/configuration_depth_pro.py | 21 +- .../models/depth_pro/modeling_depth_pro.py | 842 ++++++++---------- 2 files changed, 395 insertions(+), 468 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index d9f973639ad0..055830900417 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -59,6 +59,7 @@ class DepthProConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon 
used by the layer normalization layers. image_size (`int`, *optional*, defaults to 224): + TODO: image_size / 2**n_decoder_blocks = patch_size / patch_embeddings_size The size (resolution) of each image. patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. @@ -130,13 +131,11 @@ def __init__( out_indices=None, apply_layernorm=True, reshape_hidden_states=True, - patch_encoder_hook_ids = [5, 11], + intermediate_hook_ids = [11, 5], intermediate_feature_dims = [256, 256], - high_res_feature_dims = 512, - med_res_feature_dims = 1024, - low_res_feature_dims = 1024, - image_feature_dims = 1024, - global_feature_dims = 1024, + scaled_images_ratios = [0.25, 0.5, 1], + scaled_images_overlap_ratios = [0.0, 0.5, 0.25], + scaled_images_feature_dims = [1024, 1024, 512], use_batch_norm_in_decoder=False, use_fov_model=False, **kwargs, @@ -167,12 +166,10 @@ def __init__( ) self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states - self.patch_encoder_hook_ids = patch_encoder_hook_ids self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model + self.intermediate_hook_ids = intermediate_hook_ids self.intermediate_feature_dims = intermediate_feature_dims - self.high_res_feature_dims = high_res_feature_dims - self.med_res_feature_dims = med_res_feature_dims - self.low_res_feature_dims = low_res_feature_dims - self.image_feature_dims = image_feature_dims - self.global_feature_dims = global_feature_dims + self.scaled_images_ratios = scaled_images_ratios + self.scaled_images_overlap_ratios = scaled_images_overlap_ratios + self.scaled_images_feature_dims = scaled_images_feature_dims diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 316afe444fbb..9f146177402c 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -226,7 +226,6 @@ def forward( self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( "DepthProModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
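For reference, the padding that the updated `_merge` derives from `merge_out_size` can be sanity-checked against the previously hard-coded values of 3 (high res) and 6 (med res). A minimal sketch, assuming the default `image_size=1536`, `patch_size=384`, `patch_embeddings_size=16` and the overlap ratios used above:

import math

def merge_padding(num_patches_per_image: int, out_size: int, merge_out_size: int) -> int:
    # mirrors DepthProEncoder._merge: padding = (box_size * out_size - merge_out_size) / (2 * box_size - 2)
    box_size = int(math.sqrt(num_patches_per_image))
    return (box_size * out_size - merge_out_size) // (2 * box_size - 2)

out_size = 384 // 16  # patch_size // patch_embeddings_size = 24
# high res: 1536px image, overlap 0.25 -> 5x5 patches, merged back to out_size * 4
assert merge_padding(5 * 5, out_size, out_size * 4) == 3
# med res: 768px image, overlap 0.5 -> 3x3 patches, merged back to out_size * 2
assert merge_padding(3 * 3, out_size, out_size * 2) == 6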
@@ -617,11 +616,40 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.hidden_size = config.hidden_size self.decoder_hidden_size = config.decoder_hidden_size - self.patch_encoder_hook_ids = config.patch_encoder_hook_ids + + self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims + self.scaled_images_ratios = config.scaled_images_ratios + self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios + self.scaled_images_feature_dims = config.scaled_images_feature_dims + self.n_scaled_images = len(self.scaled_images_ratios) + self.n_intermediate_hooks = len(self.intermediate_hook_ids) self.out_size = config.patch_size // config.patch_embeddings_size - self.seq_len = self.out_size ** 2 + self.seq_len = self.out_size ** 2 # each patch is flattened + + # config.scaled_images_ratios is sorted + if config.scaled_images_ratios != sorted(config.scaled_images_ratios): + raise ValueError( + f"Values in scaled_images_ratios={config.scaled_images_ratios} " + "should be sorted from low to high" + ) + + # lowest image resolution is greator than the patch_size + if config.scaled_images_ratios[0] * config.image_size < config.patch_size: + raise ValueError( + "Image cannot be scaled to a size less than patch_size. " + f"Provide values in scaled_images_ratios={config.scaled_images_ratios} suitable " + f"to the given patch_size={config.patch_size}." + ) + + # patch_size should be a divisible by patch_embeddings_size + # else it raises an exception in DepthProViTPatchEmbeddings + if config.patch_size % config.patch_embeddings_size != 0: + raise ValueError( + f"patch_size={config.patch_size} should be divisible " + f"by patch_embeddings_size={config.patch_embeddings_size}." 
+ ) # patch encoder self.patch_encoder = DepthProViT(config) @@ -629,6 +657,17 @@ def __init__(self, config: DepthProConfig) -> None: # image encoder self.image_encoder = DepthProViT(config) + # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram + self.upsample_scaled_images = nn.ModuleList() + for i, feature_dims in enumerate(self.scaled_images_feature_dims): + upsample_block = DepthProUpsampleBlock( + input_dims=config.hidden_size, + intermediate_dims=feature_dims, + output_dims=feature_dims, + n_upsample_layers=1, + ) + self.upsample_scaled_images.append(upsample_block) + # upsampling intermediate features - (1-2) in diagram self.upsample_intermediate = nn.ModuleList() for i, feature_dims in enumerate(self.intermediate_feature_dims): @@ -637,42 +676,33 @@ def __init__(self, config: DepthProConfig) -> None: input_dims=config.hidden_size, intermediate_dims=intermediate_dims, output_dims=feature_dims, - n_upsample_layers=1+len(self.intermediate_feature_dims)-i, + n_upsample_layers=2+i, ) - self.upsample_intermediate.append(upsample_block) - # upsampling patch features (high_res, med_res, low_res) - (3-5) in diagram - self.upsample_high_res = DepthProUpsampleBlock( - input_dims=config.hidden_size, - intermediate_dims=config.high_res_feature_dims, - output_dims=config.high_res_feature_dims, - n_upsample_layers=1, - ) - self.upsample_med_res = DepthProUpsampleBlock( - input_dims=config.hidden_size, - intermediate_dims=config.med_res_feature_dims, - output_dims=config.med_res_feature_dims, - n_upsample_layers=1, - ) - self.upsample_low_res = DepthProUpsampleBlock( - input_dims=config.hidden_size, - intermediate_dims=config.low_res_feature_dims, - output_dims=config.low_res_feature_dims, - n_upsample_layers=1, - ) - # upsampling image features - (6) in diagram self.upsample_image = DepthProUpsampleBlock( input_dims=config.hidden_size, intermediate_dims=config.hidden_size, - output_dims=config.image_feature_dims, + output_dims=config.scaled_images_feature_dims[0], n_upsample_layers=1, use_proj=False, bias=True, ) + # for STEP 7: fuse low_res and image features + self.fuse_image_with_low_res = nn.Conv2d( + in_channels=config.scaled_images_feature_dims[0]*2, + out_channels=config.scaled_images_feature_dims[0], + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + def _interpolate(self, pixel_values, scale_factor): + if scale_factor == 1: + return pixel_values return nn.functional.interpolate( pixel_values, size=None, @@ -682,6 +712,10 @@ def _interpolate(self, pixel_values, scale_factor): ) def _patch(self, pixel_values, overlap_ratio): + if pixel_values.shape[-1] == self.config.patch_size: + # create patches only if scaled image is not already equal to patch size + return pixel_values + patch_size = self.config.patch_size stride = int(patch_size * (1 - overlap_ratio)) @@ -712,7 +746,11 @@ def _reshape_feature( return hidden_states def _merge(self, x: torch.Tensor, batch_size: int, merge_out_size: int) -> torch.Tensor: - """Merge the patched input into a image with sliding window.""" + if batch_size == x.shape[0]: + # merge only if the patches were created from this scaled image + # pathces are not created when scaled image size is equal to patch size + return x + # x.shape (num_patches, config.num_channels, self.out_size, self.out_size) box_size = int(math.sqrt(x.shape[0] // batch_size)) @@ -771,28 +809,35 @@ def forward( B, C, H, W = pixel_values.shape - # TODO validate: H = W = image_size - # TODO validate: C = num_channels - # TODO validate: image_size = 
patch_size * 4 + if not (H == W == self.config.image_size): + raise ValueError( + f"Height={H} and Width={W} doesnot match the specified image_size={self.config.image_size} in config." + ) + + if not (C == self.config.num_channels): + raise ValueError( + f"Found {C} channels in image, expected number of channels is {self.config.num_channels} from config." + ) # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) # STEP 1: create 3-level image - high_res = pixel_values # (B, config.num_channels, config.image_size, config.image_size) - med_res = self._interpolate(pixel_values, 0.5) # (B, config.num_channels, config.image_size//2, config.image_size//2) - low_res = self._interpolate(pixel_values, 0.25) # (B, config.num_channels, config.image_size//4, config.image_size//4) + scaled_images = [] + for ratio in self.scaled_images_ratios: + scaled_images.append(self._interpolate(pixel_values, ratio)) + # (B, config.num_channels, config.image_size * ratio, config.image_size * ratio) # STEP 2: create patches - high_res_patches = self._patch(high_res, 0.25) # (-1, config.num_channels, config.patch_size, config.patch_size) - med_res_patches = self._patch(med_res, 0.5) # (-1, config.num_channels, config.patch_size, config.patch_size) - low_res_patches = low_res # (-1, config.num_channels, config.patch_size, config.patch_size) - - patches = torch.cat( - (high_res_patches, med_res_patches, low_res_patches), - dim=0, - ) # (num_patches, config.num_channels, config.patch_size, config.patch_size) + for i in range(self.n_scaled_images): + scaled_images[i] = self._patch( + scaled_images[i], + overlap_ratio=self.scaled_images_overlap_ratios[i], + ) + scaled_images_num_patches = [len(i) for i in scaled_images] + patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first + # (sum(scaled_images_num_patches), config.num_channels, config.patch_size, config.patch_size) # STEP 3: apply patch and image encoder @@ -803,8 +848,13 @@ def forward( output_hidden_states=True, # required for intermediate features return_dict=True, ) + scaled_images_last_hidden_state = torch.split_with_sizes( + patch_encodings.last_hidden_state, + scaled_images_num_patches[::-1] + )[::-1] # -1 as patch encoder expects high res patches first + image_encodings = self.image_encoder( - pixel_values=low_res_patches, + pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -813,89 +863,87 @@ def forward( # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram - # a. extract hidden_state - hidden_state = patch_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) + scaled_images_features = [] + for i in range(self.n_scaled_images): + # a. extract hidden_state + hidden_state = scaled_images_last_hidden_state[i] + # (scaled_images_num_patches[i], self.seq_len+1, config.hidden_size) - # b. reshape back to image like - features = self._reshape_feature( - hidden_state, self.out_size, self.out_size - ) # (num_patches, config.num_channels, self.out_size, self.out_size) - high_res_features, med_res_features, low_res_features = torch.split( - features, - [len(high_res_patches), len(med_res_patches), len(low_res_patches)], - dim=0, - ) # (num_patches, config.num_channels, self.out_size, self.out_size) + # b. 
reshape back to image like + features = self._reshape_feature( + hidden_state, self.out_size, self.out_size + ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) - # c. merge patches back together - high_res_features = self._merge(high_res_features, batch_size=B, merge_out_size=self.out_size*4) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) - med_res_features = self._merge(med_res_features, batch_size=B, merge_out_size=self.out_size*2) # (B, config.hidden_size, self.out_size*2**1, self.out_size*2**1) - low_res_features = low_res_features # no merge required with low res image # (B, config.hidden_size, self.out_size*2**0, self.out_size*2**0) + # c. merge patches back together + features = self._merge( + features, batch_size=B, merge_out_size=self.out_size*2**i + ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) - # d. upsample - high_res_features = self.upsample_high_res(high_res_features) # (B, config.high_res_feature_dims, self.out_size*2**3, self.out_size*2**3) - med_res_features = self.upsample_med_res(med_res_features) # (B, config.med_res_feature_dims, self.out_size*2**2, self.out_size*2**2) - low_res_features = self.upsample_low_res(low_res_features) # (B, config.low_res_feature_dims, self.out_size*2**1, self.out_size*2**1) + # d. upsample + features = self.upsample_scaled_images[i](features) + # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) + + scaled_images_features.append(features) # STEP 5: get intermediate features - (1-2) in diagram intermediate_features = [] - for i, layer_id in enumerate(self.patch_encoder_hook_ids): + for i in range(self.n_intermediate_hooks): # a. extract hidden_state - hidden_state = patch_encodings.hidden_states[layer_id+1] # +1 to correct index position as hidden_states contain embedding output as well - # (num_patches, self.seq_len+1, config.hidden_size) + layer_id = self.intermediate_hook_ids[i] + 1 # +1 to correct index position as hidden_states contain embedding output as well + hidden_state = patch_encodings.hidden_states[layer_id] + hidden_state = hidden_state[:scaled_images_num_patches[-1]] # num_patches to be of same length as highest resolution + # (scaled_images_num_patches[-1], self.seq_len+1, config.hidden_size) # b. reshape back to image like features = self._reshape_feature( hidden_state, self.out_size, self.out_size, - ) # (num_patches, config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together features = self._merge( - features[: B * 5 * 5], batch_size=B, merge_out_size=self.out_size*4, - ) # (B, config.hidden_size, self.out_size*2**2, self.out_size*2**2) + features, batch_size=B, merge_out_size=self.out_size*2**(self.n_scaled_images-1), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample - features = self.upsample_intermediate[layer_id](features) - # (B, config.intermediate_feature_dims[i], self.out_size*2**(3+total-i), self.out_size*2**(3+total-i)) + features = self.upsample_intermediate[i](features) + # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) intermediate_features.append(features) # STEP 6: get image features - (6) in diagram # a. 
extract hidden_state - hidden_state = image_encodings.last_hidden_state # (num_patches, self.seq_len+1, config.hidden_size) + hidden_state = image_encodings.last_hidden_state # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = self._reshape_feature( hidden_state, self.out_size, self.out_size - ) # (num_patches, config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - # skipped, no merge required with low res image + image_features = self._merge( + image_features, batch_size=B, merge_out_size=self.out_size*2**(0), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample - image_features = self.upsample_image(image_features) # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) - - # STEP 7: return these features - last_hidden_state = [ - *intermediate_features, # (B, config.image_feature_dims, self.out_size*2**3+total-i, self.out_size*2**3+total-i) - high_res_features, # (B, config.image_feature_dims, self.out_size*2**3, self.out_size*2**3) - med_res_features, # (B, config.image_feature_dims, self.out_size*2**2, self.out_size*2**2) - low_res_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) - image_features, # (B, config.image_feature_dims, self.out_size*2**1, self.out_size*2**1) + image_features = self.upsample_image(image_features) # (B, config.scaled_images_feature_dims[0], self.out_size*2**1, self.out_size*2**1) + + # STEP 7: apply fusion (global_features = image_features + scaled_images_features[0]) + # fuses image_features with lowest resolution features as they are of same size + scaled_images_features[0] = torch.cat((scaled_images_features[0], image_features), dim=1) + scaled_images_features[0] = self.fuse_image_with_low_res(scaled_images_features[0]) + + # STEP 8: return these features in order of increasing size as what decoder expects + last_hidden_state = [ + # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) + *scaled_images_features, + # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) + *intermediate_features, ] - # for i in last_hidden_state: - # ic(i.shape) - # exit() - - # 768, 384, 192, 96, 48, 48 - image_size=1536 - # 384, 192, 96, 48, 24, 24 - image_size=768 (ideal) - # 288, 144, 72, 24, 24, 24 - image_size=768 (practical) - # 1536, 768, 384, 192, 96, 96 - image_size=3072 (ideal) - # 1728, 864, 432, 240, 96, 96 - image_size=3072 (practical) hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None @@ -910,84 +958,133 @@ def forward( ) -class DepthProFOVModel(nn.Module): - def __init__(self, config: DepthProConfig) -> None: - super().__init__() +class DepthProPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = DepthProConfig + base_model_prefix = "depth_pro" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["DepthProViTSwiGLUFFN"] + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +DEPTH_PRO_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +DEPTH_PRO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. 
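A rough usage sketch for the restructured model, assuming `DepthProConfig` and `DepthProModel` are importable from `transformers.models.depth_pro`; with the default configuration the encoder should return five feature maps ordered from low to high resolution:

import torch
from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig
from transformers.models.depth_pro.modeling_depth_pro import DepthProModel

config = DepthProConfig()  # defaults: image_size=1536, patch_size=384, patch_embeddings_size=16
model = DepthProModel(config)

pixel_values = torch.randn(1, config.num_channels, config.image_size, config.image_size)
with torch.no_grad():
    outputs = model(pixel_values)

# expected sizes for a batch of 1: (1, 1024, 48, 48), (1, 1024, 96, 96), (1, 512, 192, 192),
# (1, 256, 384, 384), (1, 256, 768, 768)
for feature_map in outputs.last_hidden_state:
    print(feature_map.shape)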
+""" + + +@add_start_docstrings( + "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", + DEPTH_PRO_START_DOCSTRING, +) +class DepthProModel(DepthProPreTrainedModel): + def __init__(self, config): + super().__init__(config) self.config = config - self.hidden_size = config.hidden_size - self.decoder_hidden_size = config.decoder_hidden_size + self.encoder = DepthProEncoder(config) + # Initialize weights and apply final processing + self.post_init() - self.encoder = DepthProViT(config) - self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) - self.global_neck = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), - nn.ReLU(True) - ) - self.head = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), - ) + def get_input_embeddings(self): + embeddings = { + "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, + "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, + } + return embeddings + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads) + self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) + # TODO + # @add_code_sample_docstrings( + # checkpoint=_CHECKPOINT_FOR_DOC, + # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, + # config_class=_CONFIG_FOR_DOC, + # modality="vision", + # expected_output=_EXPECTED_OUTPUT_SHAPE, + # ) def forward( self, - pixel_values: torch.Tensor, - global_features: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: + pixel_values: torch.FloatTensor, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - pixel_values = nn.functional.interpolate( - pixel_values, - size=None, - scale_factor=0.25, - mode="bilinear", - align_corners=False, - ) - encoder_outputs = self.encoder( + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encodings = self.encoder( pixel_values, - 
head_mask=head_mask, + head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] - - last_hidden_state = self.encoder_neck(last_hidden_state) - - last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token - last_hidden_state = last_hidden_state.permute(0, 2, 1) - - global_features = self.global_neck(global_features) - - ic(last_hidden_state.shape) - ic(global_features.shape) - # exit() - - last_hidden_state = last_hidden_state.reshape_as(global_features) - last_hidden_state = last_hidden_state + global_features - fov_output = self.head(last_hidden_state) - fov_output = fov_output.reshape(1) - - if not return_dict: - head_outputs = (fov_output,) - return head_outputs + encoder_outputs[1:] - - return BaseModelOutput( - last_hidden_state=fov_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) + return encodings # Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPTPreAct->DepthPro @@ -1075,325 +1172,109 @@ def forward(self, hidden_state, residual=None): return hidden_state -# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage -class DepthProDecoder(nn.Module): - def __init__(self, config: DepthProConfig) -> None: +# Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro with extra layer parameters +class DepthProFeatureFusionStage(nn.Module): + def __init__(self, config, num_layers): super().__init__() - self.config = config - - # for STEP 2: fuse low_res and image features - self.fuse_image_with_low_res = nn.Conv2d( - in_channels=config.low_res_feature_dims+config.image_feature_dims, - out_channels=config.global_feature_dims, - kernel_size=1, - stride=1, - padding=0, - bias=True, - ) - - # for STEP 3: apply decoder block for global features - self.global_proj = nn.Conv2d( - in_channels=config.global_feature_dims, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - self.global_fusion = DepthProFeatureFusionLayer(config) - - # for STEP 4: apply decoder block for med features - self.med_res_proj = nn.Conv2d( - in_channels=config.med_res_feature_dims, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - self.med_res_fusion = DepthProFeatureFusionLayer(config) - - # for STEP 5: apply decoder block for high features - self.high_res_proj = nn.Conv2d( - in_channels=config.high_res_feature_dims, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - self.high_res_fusion = DepthProFeatureFusionLayer(config) - - # for STEP 6: apply decoder block for intermediate features - self.intermediate_proj = nn.Sequential() - self.intermediate_fusion = nn.Sequential() - for i, feature_dim in enumerate(config.intermediate_feature_dims): - if i == 0: - # no projection for final intermediate layer - if feature_dim == config.decoder_hidden_size: - proj = nn.Identity() - else: - proj = nn.Conv2d( - in_channels=feature_dim, - out_channels=config.decoder_hidden_size, - kernel_size=1, - bias=False, - ) - fusion = DepthProFeatureFusionLayer(config, use_deconv=False) - else: - proj = nn.Conv2d( - in_channels=feature_dim, - out_channels=config.decoder_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - fusion = DepthProFeatureFusionLayer(config) - - self.intermediate_proj.append(proj) - 
self.intermediate_fusion.append(fusion) + self.num_layers = num_layers + self.layers = nn.ModuleList() + for _ in range(self.num_layers-1): + self.layers.append(DepthProFeatureFusionLayer(config)) + # final layer doesnot require deconvolution + self.layers.append(DepthProFeatureFusionLayer(config, use_deconv=False)) def forward(self, hidden_states): - ic("Start of Decoder") - - for i in hidden_states: - ic(i.shape) - - # STEP 1: extract features - - intermediate_features = hidden_states[:-4] - # intermediate_features_i.shape: [batch_size, config.intermediate_feature_dims_i, 768, 768], [1, 256, 384, 384] - high_res_features = hidden_states[-4] - # high_res_features.shape: [batch_size, config.high_res_feature_dims, 192, 192] - med_res_features = hidden_states[-3] - # med_res_features.shape: [batch_size, config.med_res_feature_dims, 96, 96] - low_res_features = hidden_states[-2] - # low_res_features.shape: [batch_size, config.low_res_feature_dims, 48, 48] - image_features = hidden_states[-1] - # image_features.shape: [batch_size, config.image_feature_dims, 48, 48] - - # STEP 2: fuse low_res and image features - - global_features = torch.cat((low_res_features, image_features), dim=1) - global_features = self.fuse_image_with_low_res(global_features) - # global_features.shape: [batch_size, config.global_feature_dims, 48, 48] - - # STEP 3: apply decoder block for global features - - # apply projection: used by fusion now and then fov later - global_projected = self.global_proj(global_features) - # apply fusion: used by next projections and fusions - last_features = self.global_fusion(global_projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 96, 96] - - # STEP 4: apply decoder block for med features - - projected = self.med_res_proj(med_res_features) - last_features = self.med_res_fusion(last_features, projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 192, 192] - - # STEP 5: apply decoder block for high features - - projected = self.high_res_proj(high_res_features) - last_features = self.high_res_fusion(last_features, projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 384, 384] - - # STEP 6: apply decoder block for intermediate features - - for (features, proj_layer, fusion_layer) in zip( - # reversed becuase decoding is applied from last features to first features - intermediate_features[::-1], - self.intermediate_proj[::-1], - self.intermediate_fusion[::-1], - ): - projected = proj_layer(features) - last_features = fusion_layer(last_features, projected) - # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] - # last_features.shape: [batch_size, config.decoder_hidden_size, 768, 768] - - return last_features, global_projected - - -class DepthProPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - - config_class = DepthProConfig - base_model_prefix = "depth_pro" - main_input_name = "pixel_values" - supports_gradient_checkpointing = True - _no_split_modules = ["DepthProViTSwiGLUFFN"] - _supports_sdpa = True - - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - - -DEPTH_PRO_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it - as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and - behavior. - - Parameters: - config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - -DEPTH_PRO_INPUTS_DOCSTRING = r""" - Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`DPTImageProcessor.__call__`] - for details. - - head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. -""" - + if self.num_layers != len(hidden_states): + raise ValueError( + f"num_layers={self.num_layers} in DepthProFeatureFusionStage" + f"doesnot match len(hidden_states)={len(hidden_states)}" + ) -@dataclass -class DepthProModelOutput(BaseModelOutput): - """ - Base class for model's outputs, with potential fov, hidden states and attentions. + # first layer only uses the last hidden_state + fused_hidden_state = self.layers[0](hidden_states[0]) + # looping from the second layer to last layer + for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]): + fused_hidden_state = layer(fused_hidden_state, hidden_state) - Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): - Field of View Scaler. 
- """ - fov: Optional[torch.FloatTensor] = None + return fused_hidden_state -@add_start_docstrings( - "The bare DepthPro Model transformer outputting raw hidden-states without any specific head on top.", - DEPTH_PRO_START_DOCSTRING, -) -class DepthProModel(DepthProPreTrainedModel): - def __init__(self, config, use_fov_model=None): - super().__init__(config) +class DepthProFOVModel(nn.Module): + def __init__(self, config: DepthProConfig) -> None: + super().__init__() self.config = config - self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model - - # dinov2 (vit) like encoder - self.encoder = DepthProEncoder(config) - # dpt (vit) like decoder - self.decoder = DepthProDecoder(config) - # dinov2 (vit) like encoder - self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - embeddings = { - "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, - "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, - } - if self.use_fov: - embeddings['fov_embeddings'] = self.fov_model.embeddings.patch_embeddings - return embeddings + self.hidden_size = config.hidden_size + self.decoder_hidden_size = config.decoder_hidden_size - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads) - self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) - self.fov_model.encoder.encoder.layer[layer].attention.prune_heads(heads) + self.encoder = DepthProViT(config) + self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) + self.global_neck = nn.Sequential( + nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), + nn.ReLU(True) + ) + self.head = nn.Sequential( + nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), + nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), + nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), + ) - @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) - # TODO - # @add_code_sample_docstrings( - # checkpoint=_CHECKPOINT_FOR_DOC, - # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, - # config_class=_CONFIG_FOR_DOC, - # modality="vision", - # expected_output=_EXPECTED_OUTPUT_SHAPE, - # ) def forward( self, - pixel_values: torch.FloatTensor, - head_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + pixel_values: torch.Tensor, + global_features: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if 
return_dict is not None else self.config.use_return_dict - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - encodings = self.encoder( + pixel_values = nn.functional.interpolate( pixel_values, - head_mask, + size=None, + scale_factor=0.25, + mode="bilinear", + align_corners=False, + ) + encoder_outputs = self.encoder( + pixel_values, + head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=True, + return_dict=return_dict, ) + last_hidden_state = encoder_outputs[0] - last_hidden_state = encodings.last_hidden_state - last_hidden_state, global_features = self.decoder(last_hidden_state) + last_hidden_state = self.encoder_neck(last_hidden_state) - if self.use_fov_model: - fov_encodings = self.fov_model( - pixel_values=pixel_values, - global_features=global_features.detach(), - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, - ) - fov = fov_encodings.last_hidden_state - attentions = encodings.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = encodings.hidden_states + fov_encodings.hidden_states if output_hidden_states else None - else: - fov = None - attentions = encodings.attentions - hidden_states = encodings.hidden_states + last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token + last_hidden_state = last_hidden_state.permute(0, 2, 1) + + global_features = self.global_neck(global_features) + + ic(last_hidden_state.shape) + ic(global_features.shape) + + + last_hidden_state = last_hidden_state.reshape_as(global_features) + last_hidden_state = last_hidden_state + global_features + fov_output = self.head(last_hidden_state) + fov_output = fov_output.reshape(1) if not return_dict: - outputs = (last_hidden_state, fov, hidden_states, attentions) - outputs = (i for i in outputs if i is not None) - return outputs + head_outputs = (fov_output,) + return head_outputs + encoder_outputs[1:] - return DepthProModelOutput( - last_hidden_state=last_hidden_state, - fov=fov, - hidden_states=hidden_states, - attentions=attentions, + return BaseModelOutput( + last_hidden_state=fov_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, ) @@ -1422,7 +1303,6 @@ def __init__(self, config): nn.ReLU(), ) - def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: predicted_depth = self.head(hidden_states) predicted_depth = predicted_depth.squeeze(dim=1) @@ -1450,14 +1330,45 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): class DepthProForDepthEstimation(DepthProPreTrainedModel): def __init__(self, config, use_fov_model=None): super().__init__(config) + self.config = config self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model - self.depth_pro = DepthProModel(config, use_fov_model=self.use_fov_model) + # dinov2 (vit) like encoders + self.depth_pro = DepthProModel(config) + + # project hidden states from encoder to match expected inputs in fusion stage + combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims + self.projections = nn.ModuleList() + for i, in_channels 
in enumerate(combined_feature_dims): + if i == len(combined_feature_dims)-1 and in_channels == config.decoder_hidden_size: + # projection for last layer can be ignored if input and output channels already match + self.projections.append(nn.Identity()) + else: + self.projections.append( + nn.Conv2d( + in_channels=in_channels, + out_channels=config.decoder_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + ) + + # dpt (vit) like fusion stage + self.num_decoder_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + self.fusion_stage = DepthProFeatureFusionStage(config, num_layers=self.num_decoder_layers) + + # depth estimation head self.head = DepthProDepthEstimationHead(config) + # dinov2 (vit) like encoder + self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None + # Initialize weights and apply final processing self.post_init() + @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) # @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1476,6 +1387,7 @@ def forward( Returns: Examples: + TODO ```python >>> from transformers import AutoImageProcessor, DPTForDepthEstimation >>> import torch @@ -1526,21 +1438,39 @@ def forward( output_hidden_states=output_hidden_states, return_dict=True, ) - last_hidden_state = depth_pro_outputs[0] - ic(last_hidden_state.shape) - predicted_depth = self.head(last_hidden_state) - ic(predicted_depth.shape) + last_hidden_state = depth_pro_outputs.last_hidden_state + last_hidden_state = [proj(state) for proj, state in zip(self.projections, last_hidden_state)] + fused_state = self.fusion_stage(last_hidden_state) + predicted_depth = self.head(fused_state) + + if self.use_fov_model: + # use lowest scaled image features for fov model + global_features = last_hidden_state[0].detach() + fov_encodings = self.fov_model( + pixel_values=pixel_values, + global_features=global_features, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + fov = fov_encodings.last_hidden_state + attentions = depth_pro_outputs.attentions + fov_encodings.attentions if output_attentions else None + hidden_states = depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + else: + fov = None + attentions = depth_pro_outputs.attentions + hidden_states = depth_pro_outputs.hidden_states if not return_dict: - if loss is None: - return (predicted_depth,) + depth_pro_outputs[1:] - else: - return (loss, predicted_depth) + depth_pro_outputs[1:] + outputs = (predicted_depth, fov, hidden_states, attentions) + outputs = (i for i in outputs if i is not None) + return outputs return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, - fov=depth_pro_outputs.fov, - hidden_states=depth_pro_outputs.hidden_states, - attentions=depth_pro_outputs.attentions, + fov=fov, + hidden_states=hidden_states, + attentions=attentions, ) From 6be242ce30589132e71bd437fd6016827c3d8b6a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 21 Nov 2024 13:51:45 +0500 Subject: [PATCH 015/151] make fov model compatible with custom config --- .../depth_pro/configuration_depth_pro.py | 2 + .../models/depth_pro/modeling_depth_pro.py | 267 ++++++++++-------- 2 files changed, 150 insertions(+), 119 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 
055830900417..8e197dbd0dab 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -138,6 +138,7 @@ def __init__( scaled_images_feature_dims = [1024, 1024, 512], use_batch_norm_in_decoder=False, use_fov_model=False, + num_fov_head_layers=2, **kwargs, ): super().__init__(**kwargs) @@ -168,6 +169,7 @@ def __init__( self.reshape_hidden_states = reshape_hidden_states self.use_batch_norm_in_decoder = use_batch_norm_in_decoder self.use_fov_model = use_fov_model + self.num_fov_head_layers = num_fov_head_layers self.intermediate_hook_ids = intermediate_hook_ids self.intermediate_feature_dims = intermediate_feature_dims self.scaled_images_ratios = scaled_images_ratios diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 9f146177402c..0ddd503c4cc9 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -610,6 +610,97 @@ def forward(self, features): projected = self.proj(features) return self.upsample_blocks(projected) + +def interpolate(pixel_values, scale_factor): + return nn.functional.interpolate( + pixel_values, + size=None, + scale_factor=scale_factor, + mode="bilinear", + align_corners=False, + ) + +def patch(pixel_values, patch_size, overlap_ratio): + """Creates Patches from Batch.""" + B, C, W, H = pixel_values.shape + + if W == H == patch_size: + # create patches only if scaled image is not already equal to patch size + return pixel_values + + stride = int(patch_size * (1 - overlap_ratio)) + + # (B, C, W, H) + patches = torch.nn.functional.unfold( + pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) + ) + # patches.shape (B, patch_size**2 * C, num_patches) + patches = patches.permute(2, 0, 1) + # patches.shape (num_patches, B, patch_size**2 * C) + patches = patches.reshape(-1, C, patch_size, patch_size) + # patches.shape (B * num_patches, C, patch_size, patch_size) + + return patches + +def reshape_feature(hidden_states, width, height): + """Discard class token and reshape 1D feature map to a 2D grid.""" + B, _, C = hidden_states.shape + # (B, WH+1, C) + hidden_states = hidden_states[:, 1:, :] # remove class token + # (B, WH, C) + hidden_states = hidden_states.reshape(B, width, height, C) + # (B, W, H, C) + hidden_states = hidden_states.permute(0, 3, 1, 2) + # (B, C, W, H) + return hidden_states + +def merge(patches, batch_size, merge_out_size): + """Recreates Batch from Patches.""" + num_patches, num_channels, out_size, out_size = patches.shape + + if num_patches == batch_size: + # merge only if the patches were created from scaled image + # patches are not created when scaled image size is equal to patch size + return patches + + box_size = int(math.sqrt(num_patches // batch_size)) + """ + merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) + padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) + """ + padding = ( box_size * out_size - merge_out_size ) // ( 2 * box_size - 2 ) + + i = 0 + boxes = [] + for h in range(box_size): + boxes_in_row = [] + for w in range(box_size): + box = patches[batch_size * i : batch_size * (i + 1)] + + if h != 0: + # remove pad from height if box is not at top border + box = box[..., padding:, :] + if w != 0: + # remove pad from width if box is not at left border + box = box[..., :, padding:] + if h != box_size - 1: + # remove pad from height 
if box is not at bottom border + box = box[..., :box.shape[-2]-padding, :] + if w != box_size - 1: + # remove pad from width if box is not at right border + box = box[..., :, :box.shape[-1]-padding] + + boxes_in_row.append(box) + i += 1 + + boxes_in_row = torch.cat(boxes_in_row, dim=-1) + boxes.append(boxes_in_row) + + boxes = torch.cat(boxes, dim=-2) + boxes = boxes[..., :merge_out_size, :merge_out_size] + return boxes + + class DepthProEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -700,96 +791,6 @@ def __init__(self, config: DepthProConfig) -> None: bias=True, ) - def _interpolate(self, pixel_values, scale_factor): - if scale_factor == 1: - return pixel_values - return nn.functional.interpolate( - pixel_values, - size=None, - scale_factor=scale_factor, - mode="bilinear", - align_corners=False, - ) - - def _patch(self, pixel_values, overlap_ratio): - if pixel_values.shape[-1] == self.config.patch_size: - # create patches only if scaled image is not already equal to patch size - return pixel_values - - patch_size = self.config.patch_size - stride = int(patch_size * (1 - overlap_ratio)) - - # pixel_values.shape (B, config.num_channels, config.image_size, config.image_size) - patches = torch.nn.functional.unfold( - pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) - ) - # patches.shape (B, -1, num_patches) - patches = patches.permute(2, 0, 1) - # patches.shape (num_patches, B, -1) - patches = patches.reshape(-1, self.config.num_channels, patch_size, patch_size) - # patches.shape (B * num_patches, config.num_channels, config.patch_size, config.patch_size) - - return patches - - def _reshape_feature( - self, hidden_states: torch.Tensor, width, height, cls_token_offset=1 - ): - """Discard class token and reshape 1D feature map to a 2D grid.""" - b, hw, c = hidden_states.shape - - # Remove class token. 
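An aside on the new module-level `patch` helper introduced just above: the sketch below reproduces its unfold-based patching in isolation. The concrete sizes (a 1536 px image scaled by 0.5, 384 px patches with 0.5 overlap) are illustrative assumptions, not values read from the config or the checkpoint.

```py
import torch
import torch.nn.functional as F

pixel_values = torch.randn(1, 3, 1536, 1536)                       # (B, C, H, W)
scaled = F.interpolate(pixel_values, scale_factor=0.5,
                       mode="bilinear", align_corners=False)       # (1, 3, 768, 768)

patch_size, overlap_ratio = 384, 0.5                               # illustrative values only
stride = int(patch_size * (1 - overlap_ratio))                     # 192

patches = F.unfold(scaled, kernel_size=patch_size, stride=stride)  # (B, C*patch_size**2, num_patches)
patches = patches.permute(2, 0, 1).reshape(-1, 3, patch_size, patch_size)
print(patches.shape)                                               # torch.Size([9, 3, 384, 384]) -> a 3x3 grid of patches
```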
- if cls_token_offset > 0: - hidden_states = hidden_states[:, cls_token_offset:, :] - - # Shape: (batch, height, width, dim) -> (batch, dim, height, width) - hidden_states = hidden_states.reshape(b, height, width, c).permute(0, 3, 1, 2) - return hidden_states - - def _merge(self, x: torch.Tensor, batch_size: int, merge_out_size: int) -> torch.Tensor: - if batch_size == x.shape[0]: - # merge only if the patches were created from this scaled image - # pathces are not created when scaled image size is equal to patch size - return x - - # x.shape (num_patches, config.num_channels, self.out_size, self.out_size) - box_size = int(math.sqrt(x.shape[0] // batch_size)) - - """ - merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) - padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) - """ - padding = ( box_size * self.out_size - merge_out_size ) // ( 2 * box_size - 2 ) - - i = 0 - boxes = [] - for h in range(box_size): - boxes_in_row = [] - for w in range(box_size): - box = x[batch_size * i : batch_size * (i + 1)] - - if h != 0: - # remove pad from height if box is not at top border - box = box[..., padding:, :] - if w != 0: - # remove pad from width if box is not at left border - box = box[..., :, padding:] - if h != box_size - 1: - # remove pad from height if box is not at bottom border - box = box[..., :box.shape[-2]-padding, :] - if w != box_size - 1: - # remove pad from width if box is not at right border - box = box[..., :, :box.shape[-1]-padding] - - boxes_in_row.append(box) - i += 1 - - boxes_in_row = torch.cat(boxes_in_row, dim=-1) - boxes.append(boxes_in_row) - - boxes = torch.cat(boxes, dim=-2) - boxes = boxes[..., :merge_out_size, :merge_out_size] - return boxes - def forward( self, pixel_values: torch.Tensor, @@ -825,14 +826,15 @@ def forward( scaled_images = [] for ratio in self.scaled_images_ratios: - scaled_images.append(self._interpolate(pixel_values, ratio)) + scaled_images.append(interpolate(pixel_values, ratio)) # (B, config.num_channels, config.image_size * ratio, config.image_size * ratio) # STEP 2: create patches for i in range(self.n_scaled_images): - scaled_images[i] = self._patch( + scaled_images[i] = patch( scaled_images[i], + patch_size=self.config.patch_size, overlap_ratio=self.scaled_images_overlap_ratios[i], ) scaled_images_num_patches = [len(i) for i in scaled_images] @@ -870,12 +872,12 @@ def forward( # (scaled_images_num_patches[i], self.seq_len+1, config.hidden_size) # b. reshape back to image like - features = self._reshape_feature( + features = reshape_feature( hidden_state, self.out_size, self.out_size ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) # c. merge patches back together - features = self._merge( + features = merge( features, batch_size=B, merge_out_size=self.out_size*2**i ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) @@ -897,14 +899,14 @@ def forward( # (scaled_images_num_patches[-1], self.seq_len+1, config.hidden_size) # b. reshape back to image like - features = self._reshape_feature( + features = reshape_feature( hidden_state, self.out_size, self.out_size, ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) # c. 
merge patches back together - features = self._merge( + features = merge( features, batch_size=B, merge_out_size=self.out_size*2**(self.n_scaled_images-1), ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) @@ -920,12 +922,12 @@ def forward( hidden_state = image_encodings.last_hidden_state # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like - image_features = self._reshape_feature( + image_features = reshape_feature( hidden_state, self.out_size, self.out_size ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - image_features = self._merge( + image_features = merge( image_features, batch_size=B, merge_out_size=self.out_size*2**(0), ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) @@ -1206,18 +1208,39 @@ def __init__(self, config: DepthProConfig) -> None: self.hidden_size = config.hidden_size self.decoder_hidden_size = config.decoder_hidden_size + self.out_size = config.patch_size // config.patch_embeddings_size + self.encoder = DepthProViT(config) self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) self.global_neck = nn.Sequential( nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True) ) - self.head = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size // 2, self.decoder_hidden_size // 4, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 4, self.decoder_hidden_size // 8, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - nn.Conv2d(self.decoder_hidden_size // 8, 1, kernel_size=6, stride=1, padding=0), + + if config.decoder_hidden_size // 2**config.num_fov_head_layers == 0: + raise ValueError( + f"decoder_hidden_size={config.decoder_hidden_size} should be consistent with config.num_fov_head_layers={config.num_fov_head_layers} " + "i.e config.decoder_hidden_size // 2**config.num_fov_head_layers > 0" + ) + + # create initial head layers + self.head = nn.Sequential() + for i in range(config.num_fov_head_layers): + self.head.append( + nn.Conv2d(self.decoder_hidden_size // 2**(i+1), self.decoder_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) + ) + self.head.append(nn.ReLU(True)) + # calculate expected shapes to finally generate a scalar output from final head layer + final_in_channels = self.decoder_hidden_size // 2**(config.num_fov_head_layers+1) + final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) + self.head.append( + nn.Conv2d( + in_channels=final_in_channels, + out_channels=1, + kernel_size=final_kernal_size, + stride=1, + padding=0 + ) ) def forward( @@ -1235,34 +1258,40 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - pixel_values = nn.functional.interpolate( + B, C, W, H = pixel_values.shape + + # follow the steps same as with image features in DepthProEncoder + pixel_values = interpolate( pixel_values, - size=None, - scale_factor=0.25, - mode="bilinear", - align_corners=False, + scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image ) - encoder_outputs = self.encoder( + patches = patch( pixel_values, + patch_size=self.config.patch_size, + overlap_ratio=self.config.scaled_images_overlap_ratios[0], + ) + encoder_outputs = self.encoder( + patches, head_mask=head_mask, 
output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] - last_hidden_state = self.encoder_neck(last_hidden_state) - - last_hidden_state = last_hidden_state[:, 1:] # ignore cls_token - last_hidden_state = last_hidden_state.permute(0, 2, 1) + last_hidden_state = reshape_feature( + last_hidden_state, + width=self.out_size, + height=self.out_size + ) + last_hidden_state = merge( + last_hidden_state, + batch_size=B, + merge_out_size=self.out_size, + ) global_features = self.global_neck(global_features) - ic(last_hidden_state.shape) - ic(global_features.shape) - - - last_hidden_state = last_hidden_state.reshape_as(global_features) last_hidden_state = last_hidden_state + global_features fov_output = self.head(last_hidden_state) fov_output = fov_output.reshape(1) From 01891085f0961ea28049616abed63a8bd9cb2f05 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 21 Nov 2024 13:54:43 +0500 Subject: [PATCH 016/151] replace word "decoder" with "fusion" --- .../depth_pro/configuration_depth_pro.py | 10 ++--- .../models/depth_pro/modeling_depth_pro.py | 44 +++++++++---------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 8e197dbd0dab..f124d3e5b71a 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -59,7 +59,7 @@ class DepthProConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. image_size (`int`, *optional*, defaults to 224): - TODO: image_size / 2**n_decoder_blocks = patch_size / patch_embeddings_size + TODO: image_size / 2**n_fusion_blocks = patch_size / patch_embeddings_size The size (resolution) of each image. patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. 
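With the decoder-to-fusion rename in this commit, downstream code configures the model through the `fusion_*` names. A minimal sketch, assuming this development branch is installed, of building a config with the renamed fields plus the `num_fov_head_layers` option added in the previous commit:

```py
from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig

# defaults from this patch series, except use_fov_model which is switched on here
config = DepthProConfig(
    fusion_hidden_size=256,            # was `decoder_hidden_size` before this commit
    use_batch_norm_in_fusion=False,    # was `use_batch_norm_in_decoder`
    use_fov_model=True,                # enables the DepthProFOVModel branch
    num_fov_head_layers=2,             # introduced in the previous commit
)
print(config.fusion_hidden_size, config.num_fov_head_layers)
```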
@@ -110,7 +110,7 @@ class DepthProConfig(PretrainedConfig): def __init__( self, hidden_size=1024, - decoder_hidden_size=256, + fusion_hidden_size=256, num_hidden_layers=24, num_attention_heads=16, mlp_ratio=4, @@ -136,7 +136,7 @@ def __init__( scaled_images_ratios = [0.25, 0.5, 1], scaled_images_overlap_ratios = [0.0, 0.5, 0.25], scaled_images_feature_dims = [1024, 1024, 512], - use_batch_norm_in_decoder=False, + use_batch_norm_in_fusion=False, use_fov_model=False, num_fov_head_layers=2, **kwargs, @@ -144,7 +144,7 @@ def __init__( super().__init__(**kwargs) self.hidden_size = hidden_size - self.decoder_hidden_size = decoder_hidden_size + self.fusion_hidden_size = fusion_hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.mlp_ratio = mlp_ratio @@ -167,7 +167,7 @@ def __init__( ) self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states - self.use_batch_norm_in_decoder = use_batch_norm_in_decoder + self.use_batch_norm_in_fusion = use_batch_norm_in_fusion self.use_fov_model = use_fov_model self.num_fov_head_layers = num_fov_head_layers self.intermediate_hook_ids = intermediate_hook_ids diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 0ddd503c4cc9..0ac35b582d7f 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -706,7 +706,7 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config self.hidden_size = config.hidden_size - self.decoder_hidden_size = config.decoder_hidden_size + self.fusion_hidden_size = config.fusion_hidden_size self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -762,7 +762,7 @@ def __init__(self, config: DepthProConfig) -> None: # upsampling intermediate features - (1-2) in diagram self.upsample_intermediate = nn.ModuleList() for i, feature_dims in enumerate(self.intermediate_feature_dims): - intermediate_dims = self.decoder_hidden_size if i == 0 else feature_dims + intermediate_dims = self.fusion_hidden_size if i == 0 else feature_dims upsample_block = DepthProUpsampleBlock( input_dims=config.hidden_size, intermediate_dims=intermediate_dims, @@ -939,7 +939,7 @@ def forward( scaled_images_features[0] = torch.cat((scaled_images_features[0], image_features), dim=1) scaled_images_features[0] = self.fuse_image_with_low_res(scaled_images_features[0]) - # STEP 8: return these features in order of increasing size as what decoder expects + # STEP 8: return these features in order of increasing size as what fusion expects last_hidden_state = [ # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) *scaled_images_features, @@ -1094,8 +1094,8 @@ class DepthProResidualLayer(nn.Module): def __init__(self, config): super().__init__() - self.use_batch_norm = config.use_batch_norm_in_decoder - self.hidden_size = config.decoder_hidden_size + self.use_batch_norm = config.use_batch_norm_in_fusion + self.hidden_size = config.fusion_hidden_size self.activation1 = nn.ReLU() self.convolution1 = nn.Conv2d( @@ -1151,15 +1151,15 @@ def __init__(self, config: DepthProConfig, use_deconv:bool=True) -> None: if self.use_deconv: self.deconv = nn.ConvTranspose2d( - in_channels=config.decoder_hidden_size, - out_channels=config.decoder_hidden_size, + in_channels=config.fusion_hidden_size, + 
out_channels=config.fusion_hidden_size, kernel_size=2, stride=2, padding=0, bias=False, ) - self.projection = nn.Conv2d(config.decoder_hidden_size, config.decoder_hidden_size, kernel_size=1, bias=True) + self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) self.skip_add = nn.quantized.FloatFunctional() def forward(self, hidden_state, residual=None): @@ -1206,32 +1206,32 @@ def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config self.hidden_size = config.hidden_size - self.decoder_hidden_size = config.decoder_hidden_size + self.fusion_hidden_size = config.fusion_hidden_size self.out_size = config.patch_size // config.patch_embeddings_size self.encoder = DepthProViT(config) - self.encoder_neck = nn.Linear(self.hidden_size, self.decoder_hidden_size // 2) + self.encoder_neck = nn.Linear(self.hidden_size, self.fusion_hidden_size // 2) self.global_neck = nn.Sequential( - nn.Conv2d(self.decoder_hidden_size, self.decoder_hidden_size // 2, kernel_size=3, stride=2, padding=1), + nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True) ) - if config.decoder_hidden_size // 2**config.num_fov_head_layers == 0: + if config.fusion_hidden_size // 2**config.num_fov_head_layers == 0: raise ValueError( - f"decoder_hidden_size={config.decoder_hidden_size} should be consistent with config.num_fov_head_layers={config.num_fov_head_layers} " - "i.e config.decoder_hidden_size // 2**config.num_fov_head_layers > 0" + f"fusion_hidden_size={config.fusion_hidden_size} should be consistent with config.num_fov_head_layers={config.num_fov_head_layers} " + "i.e config.fusion_hidden_size // 2**config.num_fov_head_layers > 0" ) # create initial head layers self.head = nn.Sequential() for i in range(config.num_fov_head_layers): self.head.append( - nn.Conv2d(self.decoder_hidden_size // 2**(i+1), self.decoder_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) + nn.Conv2d(self.fusion_hidden_size // 2**(i+1), self.fusion_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) ) self.head.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer - final_in_channels = self.decoder_hidden_size // 2**(config.num_fov_head_layers+1) + final_in_channels = self.fusion_hidden_size // 2**(config.num_fov_head_layers+1) final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.head.append( nn.Conv2d( @@ -1311,7 +1311,7 @@ class DepthProDepthEstimationHead(nn.Module): """ The DepthProDepthEstimationHead module serves as the output head for depth estimation tasks. This module comprises a sequence of convolutional and transposed convolutional layers - that process the feature map from the decoder to produce a single-channel depth map. + that process the feature map from the fusion to produce a single-channel depth map. Key operations include dimensionality reduction and upsampling to match the input resolution. 
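For reference, a small arithmetic check (illustration only, using the defaults shown in this series) of the channel schedule that the `DepthProFOVModel` head earlier in this hunk derives from `fusion_hidden_size` and `num_fov_head_layers`, including the `ValueError` guard:

```py
fusion_hidden_size, num_fov_head_layers = 256, 2   # defaults in this patch series

# guard mirrored from DepthProFOVModel.__init__
assert fusion_hidden_size // 2**num_fov_head_layers > 0

# encoder_neck / global_neck reduce features to fusion_hidden_size // 2 = 128 channels,
# each head conv then halves them again before a final conv produces one FOV value
in_channels = [fusion_hidden_size // 2**(i + 1) for i in range(num_fov_head_layers + 1)]
print(in_channels)   # [128, 64, 32] -> convs 128->64, 64->32, then a final 32->1 conv
```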
""" @@ -1319,7 +1319,7 @@ def __init__(self, config): super().__init__() self.config = config - features = config.decoder_hidden_size + features = config.fusion_hidden_size self.head = nn.Sequential( nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1), nn.ConvTranspose2d( @@ -1369,14 +1369,14 @@ def __init__(self, config, use_fov_model=None): combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims self.projections = nn.ModuleList() for i, in_channels in enumerate(combined_feature_dims): - if i == len(combined_feature_dims)-1 and in_channels == config.decoder_hidden_size: + if i == len(combined_feature_dims)-1 and in_channels == config.fusion_hidden_size: # projection for last layer can be ignored if input and output channels already match self.projections.append(nn.Identity()) else: self.projections.append( nn.Conv2d( in_channels=in_channels, - out_channels=config.decoder_hidden_size, + out_channels=config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, @@ -1385,8 +1385,8 @@ def __init__(self, config, use_fov_model=None): ) # dpt (vit) like fusion stage - self.num_decoder_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) - self.fusion_stage = DepthProFeatureFusionStage(config, num_layers=self.num_decoder_layers) + self.num_fusion_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + self.fusion_stage = DepthProFeatureFusionStage(config, num_layers=self.num_fusion_layers) # depth estimation head self.head = DepthProDepthEstimationHead(config) From 7614e1a709c14c8f9e32730fe240e401ae023ec3 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sun, 24 Nov 2024 13:57:36 +0500 Subject: [PATCH 017/151] weight conversion script --- .../depth_pro/convert_depth_pro_to_hf.py | 344 ++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 src/transformers/models/depth_pro/convert_depth_pro_to_hf.py diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py new file mode 100644 index 000000000000..38b7a7853d76 --- /dev/null +++ b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py @@ -0,0 +1,344 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert DepthPro checkpoints from the original repository. 
+ +URL: https://huggingface.co/apple/DepthPro/tree/main +""" + +import argparse +import json +from pathlib import Path +import re + +import requests +import torch +import torch.nn as nn +from huggingface_hub import hf_hub_download +from PIL import Image +from torchvision import transforms + +from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model +from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling +from transformers.utils import logging + +# TODO: import directly from transformers +from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig +from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def create_vit_rename_keys(config): + rename_keys = [] + # fmt: off + + # patch embedding layer + rename_keys.append(("cls_token", "embeddings.cls_token")) + rename_keys.append(("pos_embed", "embeddings.position_embeddings")) + rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) + rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) + + for i in range(config.num_hidden_layers): + # layernorms + rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) + rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) + rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) + rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) + # MLP + if config.use_swiglu_ffn: + rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) + rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) + rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) + rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) + else: + rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) + # layerscale + rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) + rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) + # attention projection layer + rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) + + # final layernorm + rename_keys.append(("norm.weight", "layernorm.weight")) + rename_keys.append(("norm.bias", "layernorm.bias")) + + # fmt: on + return rename_keys + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + state_dict_keys = state_dict.keys() + for key in list(state_dict_keys): + if "qkv" in key: + in_proj = state_dict.pop(key) + q, k, v = torch.split(in_proj, config.hidden_size, dim=0) + + if "fov" in key: + key = key.replace('fov.encoder.0', 'fov_model.encoder') + else: + key = "depth_pro." 
+ key + + key = key.replace("blocks", "encoder.layer") + state_dict[key.replace("attn.qkv", "attention.attention.query")] = q + state_dict[key.replace("attn.qkv", "attention.attention.key")] = k + state_dict[key.replace("attn.qkv", "attention.attention.value")] = v + return state_dict + +# hard coded upsample keys +def update_hard_coded_keys(state_dict): + mapping = [ + # upsamples + ('encoder.upsample_latent0.0.weight', 'depth_pro.encoder.upsample_intermediate.1.proj.weight'), + ('encoder.upsample_latent0.1.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight'), + ('encoder.upsample_latent0.2.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight'), + ('encoder.upsample_latent0.3.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight'), + ('encoder.upsample_latent1.0.weight', 'depth_pro.encoder.upsample_intermediate.0.proj.weight'), + ('encoder.upsample_latent1.1.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight'), + ('encoder.upsample_latent1.2.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight'), + ('encoder.upsample0.0.weight', 'depth_pro.encoder.upsample_scaled_images.2.proj.weight'), + ('encoder.upsample0.1.weight', 'depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight'), + ('encoder.upsample1.0.weight', 'depth_pro.encoder.upsample_scaled_images.1.proj.weight'), + ('encoder.upsample1.1.weight', 'depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight'), + ('encoder.upsample2.0.weight', 'depth_pro.encoder.upsample_scaled_images.0.proj.weight'), + ('encoder.upsample2.1.weight', 'depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight'), + ('encoder.upsample_lowres.weight', 'depth_pro.encoder.upsample_image.upsample_blocks.0.weight'), + ('encoder.upsample_lowres.bias', 'depth_pro.encoder.upsample_image.upsample_blocks.0.bias'), + + # neck + ("fov.downsample.0.weight", "fov_model.global_neck.0.weight"), + ("fov.downsample.0.bias", "fov_model.global_neck.0.bias"), + ("fov.encoder.1.weight", "fov_model.encoder_neck.weight"), + ("fov.encoder.1.bias", "fov_model.encoder_neck.bias"), + ] + for src, dest in mapping: + state_dict[dest] = state_dict.pop(src) + + return state_dict + + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + return image + + + +@torch.no_grad() +def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, push_to_hub=False): + """ + Copy/paste/tweak model's weights to our DepthPro structure. + """ + + # define default DepthPro configuration + config = DepthProConfig() + + # load original weights from huggingface hub + # TODO: download from hub + # file_path = hf_hub_download(repo_id, filename) + file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" + state_dict = torch.load(file_path, weights_only=True) + + # enumerate fusion layers + n_scaled_images = len(config.scaled_images_ratios) # 3 + n_intermediate_hooks = len(config.intermediate_hook_ids) # 2 + n_fusion_layers = n_scaled_images + n_intermediate_hooks # 5 + + # 1. 
keys for vit encoders + vit_rename_keys = create_vit_rename_keys(config) + for src_prefix, dest_prefix in [ + ("encoder.patch_encoder", "depth_pro.encoder.patch_encoder"), + ("encoder.image_encoder", "depth_pro.encoder.image_encoder"), + ("fov.encoder.0", "fov_model.encoder"), + ]: + for src, dest in vit_rename_keys: + src = src_prefix + "." + src + dest = dest_prefix + "." + dest + state_dict[dest] = state_dict.pop(src) + + # 2. qkv keys for vit encoders + state_dict = read_in_q_k_v(state_dict, config) + + # 3. hard coded mapping + state_dict = update_hard_coded_keys(state_dict) + + + for key in list(state_dict.keys()): + + # 4. final depth estimation head + if key.startswith("head."): + new_key = "head." + key + + # 5. fov model head + elif key.startswith("fov.head."): + new_key = key.replace("fov", 'fov_model') + + # 6. projections between encoder and fusion + elif "decoder.convs." in key: + n = re.findall(r'\d+', key)[0] # find digit inside string + n = n_fusion_layers - int(n) - 1 + new_key = f"projections.{n}.weight" + + # 7. fuse low res with image features + elif "encoder.fuse_lowres." in key: + new_key = key.replace("encoder.fuse_lowres", "depth_pro.encoder.fuse_image_with_low_res") + + # 8. fusion stage (decoder) + elif key.startswith("decoder.fusions."): + new_key = key.replace("decoder.fusions.", "fusion_stage.layers.") + new_key = new_key.replace("resnet1", "residual_layer1") + new_key = new_key.replace("resnet2", "residual_layer2") + new_key = new_key.replace("residual.1", "convolution1") + new_key = new_key.replace("residual.3", "convolution2") + new_key = new_key.replace("out_conv", "projection") + + n_with_dots = re.findall(r'.\d+.', new_key)[0] # find digit inside string followed by . + n = n_with_dots[1:-1] + n = n_fusion_layers - int(n) - 1 + new_key = new_key.replace(n_with_dots, f".{n}.") + + else: + continue + + state_dict[new_key] = state_dict.pop(key) + + model = DepthProForDepthEstimation(config, use_fov_model=True).eval() + model.load_state_dict(state_dict) + + exit() + + # ---------------- + + + + for key, val in state_dict.copy().items(): + val = state_dict.pop(key) + if "w12" in key: + key = key.replace("w12", "weights_in") + if "w3" in key: + key = key.replace("w3", "weights_out") + state_dict[key] = val + + # load HuggingFace model + if image_classifier: + model = Dinov2ForImageClassification(config).eval() + model.dinov2.load_state_dict(state_dict) + model_name_to_classifier_dict_url = { + "dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth", + "dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth", + "dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth", + "dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth", + } + url = model_name_to_classifier_dict_url[model_name] + classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") + model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) + model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) + else: + model = Dinov2Model(config).eval() + model.load_state_dict(state_dict) + + # load image + image = prepare_img() + + # preprocess image + transformations = transforms.Compose( + [ + transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + 
mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values + std=IMAGENET_DEFAULT_STD, # across a large photo dataset. + ), + ] + ) + + original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension + + processor = BitImageProcessor( + size={"shortest_edge": 256}, + resample=PILImageResampling.BICUBIC, + image_mean=IMAGENET_DEFAULT_MEAN, + image_std=IMAGENET_DEFAULT_STD, + ) + pixel_values = processor(image, return_tensors="pt").pixel_values + + assert torch.allclose(original_pixel_values, pixel_values) + + with torch.no_grad(): + outputs = model(pixel_values, output_hidden_states=True) + original_outputs = original_model(pixel_values) + + # assert values + if image_classifier: + print("Predicted class:") + class_idx = outputs.logits.argmax(-1).item() + print(model.config.id2label[class_idx]) + else: + assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape + assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + model_name_to_hf_name = { + "dinov2_vits14": "dinov2-small", + "dinov2_vitb14": "dinov2-base", + "dinov2_vitl14": "dinov2-large", + "dinov2_vitg14": "dinov2-giant", + "dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer", + "dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer", + "dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer", + "dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer", + } + + name = model_name_to_hf_name[model_name] + model.push_to_hub(f"facebook/{name}") + processor.push_to_hub(f"facebook/{name}") + + +convert_depth_pro_checkpoint("apple/DepthPro", "depth_pro.pt", "yooo_torch_dump", False) +exit() +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--repo_id", default="apple/DepthPro", type=str, help="Name of the repo from huggingface you'd like to convert." + ) + parser.add_argument( + "--filename", default="depth_pro.pt", type=str, help="Name of the file from repo you'd like to convert." + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
+ ) + + args = parser.parse_args() + convert_depth_pro_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) From 7d323ce91f071cc5ed6b0c36f407866e545dbe65 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 25 Nov 2024 16:41:13 +0500 Subject: [PATCH 018/151] fix fov squeeze --- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 0ac35b582d7f..eb8bf02f83d1 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1294,7 +1294,7 @@ def forward( last_hidden_state = last_hidden_state + global_features fov_output = self.head(last_hidden_state) - fov_output = fov_output.reshape(1) + fov_output = fov_output.reshape(B) if not return_dict: head_outputs = (fov_output,) From 6aaa59e943c5d5fd5c301404aaa47e8db1402355 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 25 Nov 2024 16:42:18 +0500 Subject: [PATCH 019/151] update conversion script (without test) --- .../depth_pro/convert_depth_pro_to_hf.py | 160 +++++++----------- 1 file changed, 59 insertions(+), 101 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py index 38b7a7853d76..de7bf395a355 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py @@ -83,6 +83,7 @@ def create_vit_rename_keys(config): # fmt: on return rename_keys + # we split up the matrix of each encoder layer into queries, keys and values def read_in_q_k_v(state_dict, config): state_dict_keys = state_dict.keys() @@ -102,6 +103,7 @@ def read_in_q_k_v(state_dict, config): state_dict[key.replace("attn.qkv", "attention.attention.value")] = v return state_dict + # hard coded upsample keys def update_hard_coded_keys(state_dict): mapping = [ @@ -134,13 +136,24 @@ def update_hard_coded_keys(state_dict): return state_dict - # We will verify our results on an image of cute cats -def prepare_img(): +def inference_test(processor, model): url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - return image + inputs = processor(image) + with torch.no_grad(): + outputs = model(**inputs) + + predicted_depth = outputs.predicted_depth + fov = outputs.fov + + predicted_depth, fov = processor.post_process_depth_estimation(predicted_depth, fov) + + print("predicted_depth.shape:", predicted_depth.shape) + print("fov.shape:", fov.shape) + print("fov:", fov) + print("Inference was Successfull!") @torch.no_grad() @@ -150,12 +163,10 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu """ # define default DepthPro configuration - config = DepthProConfig() + config = DepthProConfig(use_fov_model=True) # load original weights from huggingface hub - # TODO: download from hub - # file_path = hf_hub_download(repo_id, filename) - file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" + file_path = hf_hub_download(repo_id, filename) state_dict = torch.load(file_path, weights_only=True) # enumerate fusion layers @@ -224,108 +235,50 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu model = DepthProForDepthEstimation(config, use_fov_model=True).eval() model.load_state_dict(state_dict) - exit() - - # 
---------------- + # TODO + processor = ... + # inference_test(processor, model) - + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + # TODO + # print(f"Saving image processor to {pytorch_dump_folder_path}") + # processor.save_pretrained(pytorch_dump_folder_path) - for key, val in state_dict.copy().items(): - val = state_dict.pop(key) - if "w12" in key: - key = key.replace("w12", "weights_in") - if "w3" in key: - key = key.replace("w3", "weights_out") - state_dict[key] = val - - # load HuggingFace model - if image_classifier: - model = Dinov2ForImageClassification(config).eval() - model.dinov2.load_state_dict(state_dict) - model_name_to_classifier_dict_url = { - "dinov2_vits14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_linear_head.pth", - "dinov2_vitb14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_linear_head.pth", - "dinov2_vitl14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_linear_head.pth", - "dinov2_vitg14_1layer": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_linear_head.pth", - } - url = model_name_to_classifier_dict_url[model_name] - classifier_state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") - model.classifier.weight = nn.Parameter(classifier_state_dict["weight"]) - model.classifier.bias = nn.Parameter(classifier_state_dict["bias"]) - else: - model = Dinov2Model(config).eval() - model.load_state_dict(state_dict) - - # load image - image = prepare_img() - - # preprocess image - transformations = transforms.Compose( - [ - transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=IMAGENET_DEFAULT_MEAN, # these are RGB mean+std values - std=IMAGENET_DEFAULT_STD, # across a large photo dataset. 
- ), - ] - ) - original_pixel_values = transformations(image).unsqueeze(0) # insert batch dimension + # TODO + # if push_to_hub: + # model.push_to_hub("...") + # processor.push_to_hub("...") - processor = BitImageProcessor( - size={"shortest_edge": 256}, - resample=PILImageResampling.BICUBIC, - image_mean=IMAGENET_DEFAULT_MEAN, - image_std=IMAGENET_DEFAULT_STD, - ) - pixel_values = processor(image, return_tensors="pt").pixel_values - assert torch.allclose(original_pixel_values, pixel_values) +""" +- create files locally using function +```py +convert_depth_pro_checkpoint( + "apple/DepthPro", + "depth_pro.pt", + "my_local_dump", + False, +) +``` + +- create files locally using command line args +```cmd +python transformers/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py \ + --repo_id "apple/DepthPro" \ + --filename "depth_pro.pt" \ + --pytorch_dump_folder_path "my_local_dump" \ + --push_to_hub 0 +``` +""" - with torch.no_grad(): - outputs = model(pixel_values, output_hidden_states=True) - original_outputs = original_model(pixel_values) - - # assert values - if image_classifier: - print("Predicted class:") - class_idx = outputs.logits.argmax(-1).item() - print(model.config.id2label[class_idx]) - else: - assert outputs.last_hidden_state[:, 0].shape == original_outputs.shape - assert torch.allclose(outputs.last_hidden_state[:, 0], original_outputs, atol=1e-3) - print("Looks ok!") - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model {model_name} to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - model_name_to_hf_name = { - "dinov2_vits14": "dinov2-small", - "dinov2_vitb14": "dinov2-base", - "dinov2_vitl14": "dinov2-large", - "dinov2_vitg14": "dinov2-giant", - "dinov2_vits14_1layer": "dinov2-small-imagenet1k-1-layer", - "dinov2_vitb14_1layer": "dinov2-base-imagenet1k-1-layer", - "dinov2_vitl14_1layer": "dinov2-large-imagenet1k-1-layer", - "dinov2_vitg14_1layer": "dinov2-giant-imagenet1k-1-layer", - } - - name = model_name_to_hf_name[model_name] - model.push_to_hub(f"facebook/{name}") - processor.push_to_hub(f"facebook/{name}") - - -convert_depth_pro_checkpoint("apple/DepthPro", "depth_pro.pt", "yooo_torch_dump", False) -exit() if __name__ == "__main__": parser = argparse.ArgumentParser() + # Required parameters parser.add_argument( "--repo_id", default="apple/DepthPro", type=str, help="Name of the repo from huggingface you'd like to convert." 
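A hedged smoke test one could run after the converter has written `my_local_dump` (the example folder name from the usage notes above). The image processor half is still a TODO at this point in the series, so only the model is loaded here:

```py
from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation

# assumes the conversion script above has already saved weights to "my_local_dump"
model = DepthProForDepthEstimation.from_pretrained("my_local_dump").eval()
print(model.config.use_fov_model)                                    # True for this conversion
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")
```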
@@ -341,4 +294,9 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu ) args = parser.parse_args() - convert_depth_pro_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) + convert_depth_pro_checkpoint( + args.repo_id, + args.filename, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) From 263b773db7ac897a6a610e15a3fc5be0b79615da Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 25 Nov 2024 16:47:17 +0500 Subject: [PATCH 020/151] upload ruff image processing --- .../depth_pro/image_processing_depth_pro.py | 397 ++++++++++++++++++ 1 file changed, 397 insertions(+) create mode 100644 src/transformers/models/depth_pro/image_processing_depth_pro.py diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py new file mode 100644 index 000000000000..883c50ebfe6f --- /dev/null +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -0,0 +1,397 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for DepthPro.""" + +from typing import Dict, List, Optional, Union +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np +from icecream import ic + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, logging + +import math +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union + + +if TYPE_CHECKING: + from ...modeling_outputs import DepthEstimatorOutput + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import pad, resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + is_torch_available, + is_torch_tensor, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import ( + TensorType, + filter_out_non_signature_kwargs, + is_vision_available, + logging, + requires_backends, +) + +from transformers.models.depth_pro.modeling_depth_pro import DepthProDepthEstimatorOutput + + +if is_torch_available(): + import torch + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class DepthProImageProcessor(BaseImageProcessor): + r""" + Constructs a DepthPro image processor. 
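A short usage sketch for the processor added in this commit, assuming this branch is installed; the COCO test image is the same one used elsewhere in the series:

```py
import requests
from PIL import Image
from transformers.models.depth_pro.image_processing_depth_pro import DepthProImageProcessor

processor = DepthProImageProcessor()    # defaults: resize to 1536x1536, rescale by 1/255, normalize
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)     # torch.Size([1, 3, 1536, 1536])
```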
+ + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 1536, "width": 1536} + size = get_size_dict(size) + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. 
+ data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + + # ic(image.dtype) + # ic(type(image)) + # ic(image.shape) + # ic(image.mean()) + # ic(image.std()) + # ic(image.min()) + # ic(image.max()) + # ic(output_size) + # ic(resample) + # ic(data_format) + # ic(input_data_format) + # # exit() + + # return torch.nn.functional.interpolate( + # input=torch.from_numpy(image), + # size=output_size, + # mode=resample, + # align_corners=True, + # ) + + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. 
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + size = size if size is not None else self.size + size_dict = get_size_dict(size) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + # TODO + # depth-pro image preprocessing scales the image before resizing it + + if do_resize: + images = [ + self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + predicted_depth, + fov=None, + ) -> List[Dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. + """ + requires_backends(self, "torch") + + self.size = { + 'width': 3024, + 'height': 2268, + } + W = self.size['width'] + H = self.size['height'] + + if (fov is not None) and (len(predicted_depth) != len(fov)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + output_depths = [] + output_fovs = None if fov is None else [] + fov = [None] * len(predicted_depth) if fov is None else fov + for depth, fov_value in zip(predicted_depth, fov): + + if fov_value is not None: + fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * W / fov_value + + depth = torch.nn.functional.interpolate( + depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False + ).squeeze() + + if fov_value is not None: + depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) + output_fovs.append(fov_value) + + output_depths.append(depth) + + return output_depths, output_fovs From 17e5487ce6782998aaccb8a8799b9495d7d545bd Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 09:35:52 +0500 Subject: [PATCH 021/151] create fast image processing --- .../image_processing_depth_pro_fast.py | 362 ++++++++++++++++++ 1 file changed, 362 insertions(+) create mode 100644 src/transformers/models/depth_pro/image_processing_depth_pro_fast.py diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py new file mode 100644 index 000000000000..8860f2e86830 --- /dev/null +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -0,0 +1,362 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for DepthPro.""" + +import functools +from typing import Dict, List, Optional, Union + +from ...image_processing_base import BatchFeature +from ...image_processing_utils import get_size_dict +from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict +from ...image_transforms import FusedRescaleNormalize, NumpyToTensor, Rescale +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + ImageType, + PILImageResampling, + get_image_type, + make_list_of_images, + pil_torch_interpolation_mapping, +) +from ...utils import TensorType, logging, requires_backends +from ...utils.import_utils import is_torch_available, is_torchvision_available + +logger = logging.get_logger(__name__) + + +if is_torch_available(): + import torch + + +if is_torchvision_available(): + from torchvision.transforms import Compose, Normalize, PILToTensor, Resize + + +class DepthProImageProcessorFast(BaseImageProcessorFast): + r""" + Constructs a DepthPro image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. 
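+
+    Example (a minimal usage sketch; it assumes `torch` and `torchvision` are installed and imports the
+    class from its module path, since a top-level `transformers` export may not be wired up yet):
+
+    ```python
+    >>> from PIL import Image
+    >>> from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast
+
+    >>> image = Image.new("RGB", (640, 480))
+    >>> processor = DepthProImageProcessorFast()
+    >>> inputs = processor(images=image, return_tensors="pt")
+    >>> list(inputs["pixel_values"].shape)
+    [1, 3, 1536, 1536]
+    ```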
+ """ + + model_input_names = ["pixel_values"] + _transform_params = [ + "do_resize", + "do_rescale", + "do_normalize", + "size", + "resample", + "antialias", + "rescale_factor", + "image_mean", + "image_std", + "image_type", + ] + + def __init__( + self, + do_resize: bool = True, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 1536, "width": 1536} + size = get_size_dict(size) + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.size = size + self.resample = resample + self.antialias = antialias + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def _build_transforms( + self, + do_resize: bool, + size: Dict[str, int], + resample: PILImageResampling, + antialias: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + image_type: ImageType, + ) -> "Compose": + """ + Given the input settings build the image transforms using `torchvision.transforms.Compose`. + """ + transforms = [] + + # All PIL and numpy values need to be converted to a torch tensor + # to keep cross compatibility with slow image processors + if image_type == ImageType.PIL: + transforms.append(PILToTensor()) + + elif image_type == ImageType.NUMPY: + transforms.append(NumpyToTensor()) + + # We can combine rescale and normalize into a single operation for speed + if do_rescale and do_normalize: + transforms.append(FusedRescaleNormalize(image_mean, image_std, rescale_factor=rescale_factor)) + elif do_rescale: + transforms.append(Rescale(rescale_factor=rescale_factor)) + elif do_normalize: + transforms.append(Normalize(image_mean, image_std)) + + # depth-pro scales the image before resizing it + if do_resize: + transforms.append( + Resize( + (size["height"], size["width"]), + interpolation=pil_torch_interpolation_mapping[resample], + antialias=antialias + ) + ) + + return Compose(transforms) + + @functools.lru_cache(maxsize=1) + def _validate_input_arguments( + self, + return_tensors: Union[str, TensorType], + do_resize: bool, + size: Dict[str, int], + resample: PILImageResampling, + antialias: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + data_format: Union[str, ChannelDimension], + image_type: ImageType, + ): + if return_tensors != "pt": + raise ValueError("Only returning PyTorch tensors is currently supported.") + + if data_format != ChannelDimension.FIRST: + raise ValueError("Only channel first data format is currently supported.") + + if do_resize and None in (size, resample, antialias): + raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be 
specified if do_normalize is True.") + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + antialias: Optional[bool] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = "pt", + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Only "pt" is supported + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. The following formats are currently supported: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
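+
+        Example (a brief sketch of per-call overrides; `processor` is assumed to be an instantiated
+        `DepthProImageProcessorFast` and `image` a PIL image):
+
+        ```python
+        >>> inputs = processor(images=image, size={"height": 768, "width": 768}, do_normalize=False)
+        >>> list(inputs["pixel_values"].shape)
+        [1, 3, 768, 768]
+        ```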
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + antialias = antialias if antialias is not None else self.antialias + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + size = size if size is not None else self.size + # Make hashable for cache + size = SizeDict(**size) + image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean + image_std = tuple(image_std) if isinstance(image_std, list) else image_std + + images = make_list_of_images(images) + image_type = get_image_type(images[0]) + + if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]: + raise ValueError(f"Unsupported input image type {image_type}") + + self._validate_input_arguments( + do_resize=do_resize, + size=size, + resample=resample, + antialias=antialias, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + return_tensors=return_tensors, + data_format=data_format, + image_type=image_type, + ) + + transforms = self.get_transforms( + do_resize=do_resize, + do_rescale=do_rescale, + do_normalize=do_normalize, + size=size, + resample=resample, + antialias=antialias, + rescale_factor=rescale_factor, + image_mean=image_mean, + image_std=image_std, + image_type=image_type, + ) + transformed_images = [transforms(image) for image in images] + + data = {"pixel_values": torch.stack(transformed_images, dim=0)} + return BatchFeature(data, tensor_type=return_tensors) + + def post_process_depth_estimation( + self, + predicted_depth, + fov=None, + ) -> List[Dict[str, TensorType]]: + """ + Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. + Only supports PyTorch. + + Args: + outputs ([`DepthEstimatorOutput`]): + Raw outputs of the model. + target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + + Returns: + `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth + predictions. 
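+
+        Example (a minimal sketch of the expected inputs; `processor` is assumed to be an instantiated
+        `DepthProImageProcessorFast`, the tensors below are random stand-ins for real model outputs, and
+        the output resolution reflects the hard-coded `self.size` used at this stage):
+
+        ```python
+        >>> import torch
+
+        >>> predicted_depth = torch.rand(1, 384, 384)  # raw depth maps from the model
+        >>> fov = torch.tensor([60.0])  # estimated field of view per image, in degrees
+        >>> depths, fovs = processor.post_process_depth_estimation(predicted_depth, fov)
+        >>> list(depths[0].shape)
+        [2268, 3024]
+        ```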
+ """ + requires_backends(self, "torch") + + self.size = { + 'width': 3024, + 'height': 2268, + } + W = self.size['width'] + H = self.size['height'] + + if (fov is not None) and (len(predicted_depth) != len(fov)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + output_depths = [] + output_fovs = None if fov is None else [] + fov = [None] * len(predicted_depth) if fov is None else fov + for depth, fov_value in zip(predicted_depth, fov): + + if fov_value is not None: + fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * W / fov_value + + depth = torch.nn.functional.interpolate( + depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False + ).squeeze() + + if fov_value is not None: + depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) + output_fovs.append(fov_value) + + output_depths.append(depth) + + return output_depths, output_fovs From a8dd7049a5e2683a06f8d8df4cb7d22673d35b4b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 10:42:36 +0500 Subject: [PATCH 022/151] use torch interpolation for image processing --- .../depth_pro/image_processing_depth_pro.py | 112 +++++++++++------- 1 file changed, 66 insertions(+), 46 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 883c50ebfe6f..d8b9ff493b1a 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -14,6 +14,7 @@ # limitations under the License. """Image processor class for DepthPro.""" +import functools from typing import Dict, List, Optional, Union from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union @@ -33,7 +34,7 @@ make_list_of_images, to_numpy_array, valid_images, - validate_preprocess_arguments, + pil_torch_interpolation_mapping, ) from ...utils import TensorType, filter_out_non_signature_kwargs, logging @@ -62,7 +63,6 @@ make_list_of_images, to_numpy_array, valid_images, - validate_preprocess_arguments, ) from ...utils import ( TensorType, @@ -99,6 +99,9 @@ class DepthProImageProcessor(BaseImageProcessor): resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` parameter in the `preprocess` method. 
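The remaining hunks of this patch replace the PIL/numpy `resize` helper with a single batched call to
`torch.nn.functional.interpolate` in channels-first layout, gated by the new `antialias` flag. A
standalone sketch of that pattern, using random data and assuming a PyTorch version recent enough to
accept `antialias` for bilinear resizing:

    import numpy as np
    import torch

    # a small batch of channels-first float images
    images = [np.random.rand(3, 768, 1024).astype("float32") for _ in range(2)]
    batch = torch.from_numpy(np.stack(images))  # (batch, channels, height, width)
    resized = torch.nn.functional.interpolate(
        batch, size=(1536, 1536), mode="bilinear", antialias=False
    )
    print(resized.shape)  # torch.Size([2, 3, 1536, 1536])
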
@@ -123,6 +126,7 @@ def __init__( do_resize: bool = True, size: Optional[Dict[str, int]] = None, resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, @@ -138,15 +142,17 @@ def __init__( self.do_normalize = do_normalize self.size = size self.resample = resample + self.antialias = antialias self.rescale_factor = rescale_factor self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD def resize( self, - image: np.ndarray, + images: List[np.ndarray], size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, @@ -155,12 +161,15 @@ def resize( Resize an image to `(size["height"], size["width"])`. Args: - image (`np.ndarray`): - Image to resize. + images (`List[np.ndarray]`): + Images to resize. size (`Dict[str, int]`): Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. Can be one of: @@ -175,41 +184,49 @@ def resize( - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. Returns: - `np.ndarray`: The resized image. + `np.ndarray`: The resized images. """ size = get_size_dict(size) if "height" not in size or "width" not in size: raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") output_size = (size["height"], size["width"]) - # ic(image.dtype) - # ic(type(image)) - # ic(image.shape) - # ic(image.mean()) - # ic(image.std()) - # ic(image.min()) - # ic(image.max()) - # ic(output_size) - # ic(resample) - # ic(data_format) - # ic(input_data_format) - # # exit() - - # return torch.nn.functional.interpolate( - # input=torch.from_numpy(image), - # size=output_size, - # mode=resample, - # align_corners=True, - # ) - - return resize( - image, + images = np.stack(images) + images = torch.from_numpy(images) + + return torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=images, size=output_size, - resample=resample, - data_format=data_format, - input_data_format=input_data_format, - **kwargs, - ) + # mode=pil_torch_interpolation_mapping[resample], + mode="bilinear", + antialias=antialias, + ).numpy() + + def _validate_input_arguments( + self, + do_resize: bool, + size: Dict[str, int], + resample: PILImageResampling, + antialias: bool, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + data_format: Union[str, ChannelDimension], + ): + if data_format != ChannelDimension.FIRST: + raise ValueError("Only channel first data format is currently supported.") + + if do_resize and None in (size, resample, antialias): + raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.") @filter_out_non_signature_kwargs() def preprocess( @@ -218,6 +235,7 @@ def preprocess( do_resize: Optional[bool] = None, size: Dict[str, int] = None, resample: PILImageResampling = None, + antialias: Optional[bool] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, @@ -242,6 +260,9 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -275,6 +296,7 @@ def preprocess( do_rescale = do_rescale if do_rescale is not None else self.do_rescale do_normalize = do_normalize if do_normalize is not None else self.do_normalize resample = resample if resample is not None else self.resample + antialias = antialias if antialias is not None else self.antialias rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std @@ -289,15 +311,17 @@ def preprocess( "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "torch.Tensor, tf.Tensor or jax.ndarray." 
) - validate_preprocess_arguments( + self._validate_input_arguments( + do_resize=do_resize, + size=size, + resample=resample, + antialias=antialias, do_rescale=do_rescale, rescale_factor=rescale_factor, do_normalize=do_normalize, image_mean=image_mean, image_std=image_std, - do_resize=do_resize, - size=size, - resample=resample, + data_format=data_format, ) # All transformations expect numpy arrays. @@ -313,15 +337,6 @@ def preprocess( # We assume that all images have the same channel dimension format. input_data_format = infer_channel_dimension_format(images[0]) - # TODO - # depth-pro image preprocessing scales the image before resizing it - - if do_resize: - images = [ - self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format) - for image in images - ] - if do_rescale: images = [ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) @@ -338,6 +353,11 @@ def preprocess( to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] + # depth-pro scales the image before resizing it + # uses torch interpolation which requires ChannelDimension.FIRST + if do_resize: + images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) + data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) From 261bbafe4fb65d3bfe344045d92c7ca67f05283f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 12:12:39 +0500 Subject: [PATCH 023/151] complete post_process_depth_estimation --- .../depth_pro/image_processing_depth_pro.py | 71 +++++++++++-------- .../image_processing_depth_pro_fast.py | 70 ++++++++++-------- 2 files changed, 83 insertions(+), 58 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index d8b9ff493b1a..0a7313e2d19a 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -14,13 +14,13 @@ # limitations under the License. """Image processor class for DepthPro.""" -import functools from typing import Dict, List, Optional, Union from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union import numpy as np from icecream import ic + from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import resize, to_channel_dimension_format from ...image_utils import ( @@ -186,6 +186,8 @@ def resize( Returns: `np.ndarray`: The resized images. """ + requires_backends(self, "torch") + size = get_size_dict(size) if "height" not in size or "width" not in size: raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") @@ -198,10 +200,9 @@ def resize( # input should be (B, C, H, W) input=images, size=output_size, - # mode=pil_torch_interpolation_mapping[resample], - mode="bilinear", + mode=pil_torch_interpolation_mapping[resample].value, antialias=antialias, - ).numpy() + ) def _validate_input_arguments( self, @@ -357,14 +358,16 @@ def preprocess( # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) + images = images.numpy() data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) def post_process_depth_estimation( self, - predicted_depth, - fov=None, + predicted_depths, + fovs=None, + target_sizes=None, ) -> List[Dict[str, TensorType]]: """ Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. @@ -383,35 +386,45 @@ def post_process_depth_estimation( """ requires_backends(self, "torch") - self.size = { - 'width': 3024, - 'height': 2268, - } - W = self.size['width'] - H = self.size['height'] - - if (fov is not None) and (len(predicted_depth) != len(fov)): + if (fovs is not None) and (len(predicted_depths) != len(fovs)): raise ValueError( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) + if (target_sizes is not None) and (len(predicted_depths) != len(target_sizes)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + + outputs = { + "predicted_depth": [], + "fov": [] if fovs is not None else None + } + + fovs = [None] * len(predicted_depths) if fovs is None else fovs + target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes + + for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): - output_depths = [] - output_fovs = None if fov is None else [] - fov = [None] * len(predicted_depth) if fov is None else fov - for depth, fov_value in zip(predicted_depth, fov): + if target_size is not None: - if fov_value is not None: - fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) - depth = depth * W / fov_value + # scale image w.r.t fov + if fov is not None: + width = target_size[1] + fov = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov)) + predicted_depth = predicted_depth * width / fov + outputs["fov"].append(fov) - depth = torch.nn.functional.interpolate( - depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False - ).squeeze() + # interpolate + predicted_depth = self.resize( + predicted_depth.unsqueeze(0).unsqueeze(1), + size=target_size, + resample=self.resample, + antialias=self.antialias + ).squeeze() - if fov_value is not None: - depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) - output_fovs.append(fov_value) + # inverse the depth + predicted_depth = 1.0 / torch.clamp(predicted_depth, min=1e-4, max=1e4) - output_depths.append(depth) + outputs["predicted_depth"].append(predicted_depth) - return output_depths, output_fovs + return outputs diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 8860f2e86830..38d699452e44 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -154,7 +154,7 @@ def _build_transforms( elif do_normalize: transforms.append(Normalize(image_mean, image_std)) 
- # depth-pro scales the image before resizing it + # depth-pro scales the image before resizing it if do_resize: transforms.append( Resize( @@ -229,9 +229,9 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -308,8 +308,9 @@ def preprocess( def post_process_depth_estimation( self, - predicted_depth, - fov=None, + predicted_depths, + fovs=None, + target_sizes=None, ) -> List[Dict[str, TensorType]]: """ Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. @@ -328,35 +329,46 @@ def post_process_depth_estimation( """ requires_backends(self, "torch") - self.size = { - 'width': 3024, - 'height': 2268, - } - W = self.size['width'] - H = self.size['height'] - - if (fov is not None) and (len(predicted_depth) != len(fov)): + if (fovs is not None) and (len(predicted_depths) != len(fovs)): + raise ValueError( + "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" + ) + if (target_sizes is not None) and (len(predicted_depths) != len(target_sizes)): raise ValueError( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) - output_depths = [] - output_fovs = None if fov is None else [] - fov = [None] * len(predicted_depth) if fov is None else fov - for depth, fov_value in zip(predicted_depth, fov): + outputs = { + "predicted_depth": [], + "fov": [] if fovs is not None else None + } + + fovs = [None] * len(predicted_depths) if fovs is None else fovs + target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes + + for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): + + if target_size is not None: - if fov_value is not None: - fov_value = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_value)) - depth = depth * W / fov_value + # scale image w.r.t fov + if fov is not None: + width = target_size[1] + fov = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov)) + predicted_depth = predicted_depth * width / fov + outputs["fov"].append(fov) - depth = torch.nn.functional.interpolate( - depth.unsqueeze(0).unsqueeze(1), size=(H, W), mode="bilinear", align_corners=False - ).squeeze() + # interpolate + predicted_depth = torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=predicted_depth.unsqueeze(0).unsqueeze(1), + size=target_size, + mode=pil_torch_interpolation_mapping[self.resample].value, + antialias=self.antialias, + ).squeeze() - if fov_value is not None: - depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4) - output_fovs.append(fov_value) + # inverse the depth + predicted_depth = 1.0 / torch.clamp(predicted_depth, min=1e-4, max=1e4) - output_depths.append(depth) + outputs["predicted_depth"].append(predicted_depth) - 
return output_depths, output_fovs + return outputs From a4b3556c5f7ef738048df1b7de22dfa45c822b43 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 16:36:19 +0500 Subject: [PATCH 024/151] config: fix imports and sort args --- .../depth_pro/configuration_depth_pro.py | 49 +++++++++---------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index f124d3e5b71a..fae3e84432be 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -14,15 +14,8 @@ # limitations under the License. """DepthPro model configuration""" -from collections import OrderedDict -from typing import Mapping - -from packaging import version - -from transformers.configuration_utils import PretrainedConfig -from transformers.onnx import OnnxConfig -from transformers.utils import logging -from transformers.utils.backbone_utils import get_aligned_output_features_output_indices +from ...configuration_utils import PretrainedConfig +from ...utils import logging logger = logging.get_logger(__name__) @@ -41,6 +34,8 @@ class DepthProConfig(PretrainedConfig): Args: hidden_size (`int`, *optional*, defaults to 1024): Dimensionality of the encoder layers and the pooler layer. + fusion_hidden_size + TODO num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): @@ -65,6 +60,8 @@ class DepthProConfig(PretrainedConfig): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. + patch_embeddings_size + TODO qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. layerscale_value (`float`, *optional*, defaults to 1.0): @@ -73,22 +70,28 @@ class DepthProConfig(PretrainedConfig): Stochastic depth rate per sample (when applied in the main path of residual layers). use_swiglu_ffn (`bool`, *optional*, defaults to `False`): Whether to use the SwiGLU feedforward neural network. - out_features (`List[str]`, *optional*): - If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. - (depending on how many stages the model has). If unset and `out_indices` is set, will default to the - corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the - same order as defined in the `stage_names` attribute. - out_indices (`List[int]`, *optional*): - If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how - many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. - If unset and `out_features` is unset, will default to the last stage. Must be in the - same order as defined in the `stage_names` attribute. apply_layernorm (`bool`, *optional*, defaults to `True`): Whether to apply layer normalization to the feature maps in case the model is used as backbone. reshape_hidden_states (`bool`, *optional*, defaults to `True`): Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, seq_len, hidden_size)`. 
+ intermediate_hook_ids + TODO + intermediate_feature_dims + TODO + scaled_images_ratios + TODO + scaled_images_overlap_ratios + TODO + scaled_images_feature_dims + TODO + use_batch_norm_in_fusion + TODO + use_fov_model + TODO + num_fov_head_layers + TODO Example: @@ -127,8 +130,6 @@ def __init__( layerscale_value=1.0, drop_path_rate=0.0, use_swiglu_ffn=False, - out_features=None, - out_indices=None, apply_layernorm=True, reshape_hidden_states=True, intermediate_hook_ids = [11, 5], @@ -137,7 +138,7 @@ def __init__( scaled_images_overlap_ratios = [0.0, 0.5, 0.25], scaled_images_feature_dims = [1024, 1024, 512], use_batch_norm_in_fusion=False, - use_fov_model=False, + use_fov_model=True, num_fov_head_layers=2, **kwargs, ): @@ -161,10 +162,6 @@ def __init__( self.layerscale_value = layerscale_value self.drop_path_rate = drop_path_rate self.use_swiglu_ffn = use_swiglu_ffn - self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)] - self._out_features, self._out_indices = get_aligned_output_features_output_indices( - out_features=out_features, out_indices=out_indices, stage_names=self.stage_names - ) self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states self.use_batch_norm_in_fusion = use_batch_norm_in_fusion From f13c63208caec6b70a9d8660a42d92ec4c18af3a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 26 Nov 2024 16:51:12 +0500 Subject: [PATCH 025/151] apply inference in weight conversion --- .../depth_pro/convert_depth_pro_to_hf.py | 63 ++++++++++++------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py index de7bf395a355..7b4552c508ff 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py @@ -18,24 +18,22 @@ """ import argparse -import json from pathlib import Path import re import requests import torch -import torch.nn as nn from huggingface_hub import hf_hub_download from PIL import Image -from torchvision import transforms -from transformers import BitImageProcessor, Dinov2Config, Dinov2ForImageClassification, Dinov2Model -from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, PILImageResampling +from transformers.image_utils import PILImageResampling from transformers.utils import logging +# from transformers import DepthProConfig, DepthProImageProcessorFast, DepthProForDepthEstimation # TODO: import directly from transformers from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation +from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast logging.set_verbosity_info() @@ -147,13 +145,21 @@ def inference_test(processor, model): predicted_depth = outputs.predicted_depth fov = outputs.fov + target_sizes = [[image.height, image.width]] * len(predicted_depth) - predicted_depth, fov = processor.post_process_depth_estimation(predicted_depth, fov) + outputs = processor.post_process_depth_estimation( + predicted_depths=predicted_depth, + fovs=fov, + target_sizes=target_sizes, + ) + predicted_depth = outputs['predicted_depth'] + fov = outputs['fov'] - print("predicted_depth.shape:", predicted_depth.shape) - print("fov.shape:", fov.shape) + print("\nInference ...") + print("predicted_depth:", predicted_depth) + 
print("predicted_depth[0].shape:", predicted_depth[0].shape) print("fov:", fov) - print("Inference was Successfull!") + print("Inference was Successfull!\n") @torch.no_grad() @@ -167,6 +173,7 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu # load original weights from huggingface hub file_path = hf_hub_download(repo_id, filename) + # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" state_dict = torch.load(file_path, weights_only=True) # enumerate fusion layers @@ -235,23 +242,31 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu model = DepthProForDepthEstimation(config, use_fov_model=True).eval() model.load_state_dict(state_dict) - # TODO - processor = ... - # inference_test(processor, model) + processor = DepthProImageProcessorFast( + do_resize = True, + size = {"height": 1536, "width": 1536}, + resample = PILImageResampling.BILINEAR, + antialias = False, + do_rescale = True, + rescale_factor = 1 / 255, + do_normalize = True, + image_mean = 0.5, + image_std = 0.5, + return_tensors = "pt", + ) + inference_test(processor, model) if pytorch_dump_folder_path is not None: Path(pytorch_dump_folder_path).mkdir(exist_ok=True) print(f"Saving model to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) - # TODO - # print(f"Saving image processor to {pytorch_dump_folder_path}") - # processor.save_pretrained(pytorch_dump_folder_path) - + print(f"Saving image processor to {pytorch_dump_folder_path}") + processor.save_pretrained(pytorch_dump_folder_path) - # TODO - # if push_to_hub: - # model.push_to_hub("...") - # processor.push_to_hub("...") + if push_to_hub: + hub_path = "geetu040/DepthPro" + model.push_to_hub(hub_path) + processor.push_to_hub(hub_path) """ @@ -260,8 +275,8 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu convert_depth_pro_checkpoint( "apple/DepthPro", "depth_pro.pt", - "my_local_dump", - False, + "my_local_depth_pro_dump", + True, ) ``` @@ -270,8 +285,8 @@ def convert_depth_pro_checkpoint(repo_id, filename, pytorch_dump_folder_path, pu python transformers/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py \ --repo_id "apple/DepthPro" \ --filename "depth_pro.pt" \ - --pytorch_dump_folder_path "my_local_dump" \ - --push_to_hub 0 + --pytorch_dump_folder_path "my_local_depth_pro_dump" \ + --push_to_hub ``` """ From 387ddd8c7e50f419d1abcd5a61cd48ea23e0d626 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 10:55:18 +0500 Subject: [PATCH 026/151] use mllama script instead for weight conversion --- .../depth_pro/convert_depth_pro_to_hf.py | 317 ------------------ .../convert_depth_pro_weights_to_hf.py | 255 ++++++++++++++ 2 files changed, 255 insertions(+), 317 deletions(-) delete mode 100644 src/transformers/models/depth_pro/convert_depth_pro_to_hf.py create mode 100644 src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py diff --git a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py deleted file mode 100644 index 7b4552c508ff..000000000000 --- a/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py +++ /dev/null @@ -1,317 +0,0 @@ -# coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert DepthPro checkpoints from the original repository. - -URL: https://huggingface.co/apple/DepthPro/tree/main -""" - -import argparse -from pathlib import Path -import re - -import requests -import torch -from huggingface_hub import hf_hub_download -from PIL import Image - -from transformers.image_utils import PILImageResampling -from transformers.utils import logging - -# from transformers import DepthProConfig, DepthProImageProcessorFast, DepthProForDepthEstimation -# TODO: import directly from transformers -from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig -from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation -from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast - - -logging.set_verbosity_info() -logger = logging.get_logger(__name__) - - -def create_vit_rename_keys(config): - rename_keys = [] - # fmt: off - - # patch embedding layer - rename_keys.append(("cls_token", "embeddings.cls_token")) - rename_keys.append(("pos_embed", "embeddings.position_embeddings")) - rename_keys.append(("patch_embed.proj.weight", "embeddings.patch_embeddings.projection.weight")) - rename_keys.append(("patch_embed.proj.bias", "embeddings.patch_embeddings.projection.bias")) - - for i in range(config.num_hidden_layers): - # layernorms - rename_keys.append((f"blocks.{i}.norm1.weight", f"encoder.layer.{i}.norm1.weight")) - rename_keys.append((f"blocks.{i}.norm1.bias", f"encoder.layer.{i}.norm1.bias")) - rename_keys.append((f"blocks.{i}.norm2.weight", f"encoder.layer.{i}.norm2.weight")) - rename_keys.append((f"blocks.{i}.norm2.bias", f"encoder.layer.{i}.norm2.bias")) - # MLP - if config.use_swiglu_ffn: - rename_keys.append((f"blocks.{i}.mlp.w12.weight", f"encoder.layer.{i}.mlp.w12.weight")) - rename_keys.append((f"blocks.{i}.mlp.w12.bias", f"encoder.layer.{i}.mlp.w12.bias")) - rename_keys.append((f"blocks.{i}.mlp.w3.weight", f"encoder.layer.{i}.mlp.w3.weight")) - rename_keys.append((f"blocks.{i}.mlp.w3.bias", f"encoder.layer.{i}.mlp.w3.bias")) - else: - rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"encoder.layer.{i}.mlp.fc1.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"encoder.layer.{i}.mlp.fc1.bias")) - rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"encoder.layer.{i}.mlp.fc2.weight")) - rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"encoder.layer.{i}.mlp.fc2.bias")) - # layerscale - rename_keys.append((f"blocks.{i}.ls1.gamma", f"encoder.layer.{i}.layer_scale1.lambda1")) - rename_keys.append((f"blocks.{i}.ls2.gamma", f"encoder.layer.{i}.layer_scale2.lambda1")) - # attention projection layer - rename_keys.append((f"blocks.{i}.attn.proj.weight", f"encoder.layer.{i}.attention.output.dense.weight")) - rename_keys.append((f"blocks.{i}.attn.proj.bias", f"encoder.layer.{i}.attention.output.dense.bias")) - - # final layernorm - rename_keys.append(("norm.weight", "layernorm.weight")) - rename_keys.append(("norm.bias", "layernorm.bias")) - - # fmt: on - return rename_keys - - -# we split up the matrix of each encoder layer into queries, keys and values -def 
read_in_q_k_v(state_dict, config): - state_dict_keys = state_dict.keys() - for key in list(state_dict_keys): - if "qkv" in key: - in_proj = state_dict.pop(key) - q, k, v = torch.split(in_proj, config.hidden_size, dim=0) - - if "fov" in key: - key = key.replace('fov.encoder.0', 'fov_model.encoder') - else: - key = "depth_pro." + key - - key = key.replace("blocks", "encoder.layer") - state_dict[key.replace("attn.qkv", "attention.attention.query")] = q - state_dict[key.replace("attn.qkv", "attention.attention.key")] = k - state_dict[key.replace("attn.qkv", "attention.attention.value")] = v - return state_dict - - -# hard coded upsample keys -def update_hard_coded_keys(state_dict): - mapping = [ - # upsamples - ('encoder.upsample_latent0.0.weight', 'depth_pro.encoder.upsample_intermediate.1.proj.weight'), - ('encoder.upsample_latent0.1.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight'), - ('encoder.upsample_latent0.2.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight'), - ('encoder.upsample_latent0.3.weight', 'depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight'), - ('encoder.upsample_latent1.0.weight', 'depth_pro.encoder.upsample_intermediate.0.proj.weight'), - ('encoder.upsample_latent1.1.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight'), - ('encoder.upsample_latent1.2.weight', 'depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight'), - ('encoder.upsample0.0.weight', 'depth_pro.encoder.upsample_scaled_images.2.proj.weight'), - ('encoder.upsample0.1.weight', 'depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight'), - ('encoder.upsample1.0.weight', 'depth_pro.encoder.upsample_scaled_images.1.proj.weight'), - ('encoder.upsample1.1.weight', 'depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight'), - ('encoder.upsample2.0.weight', 'depth_pro.encoder.upsample_scaled_images.0.proj.weight'), - ('encoder.upsample2.1.weight', 'depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight'), - ('encoder.upsample_lowres.weight', 'depth_pro.encoder.upsample_image.upsample_blocks.0.weight'), - ('encoder.upsample_lowres.bias', 'depth_pro.encoder.upsample_image.upsample_blocks.0.bias'), - - # neck - ("fov.downsample.0.weight", "fov_model.global_neck.0.weight"), - ("fov.downsample.0.bias", "fov_model.global_neck.0.bias"), - ("fov.encoder.1.weight", "fov_model.encoder_neck.weight"), - ("fov.encoder.1.bias", "fov_model.encoder_neck.bias"), - ] - for src, dest in mapping: - state_dict[dest] = state_dict.pop(src) - - return state_dict - - -# We will verify our results on an image of cute cats -def inference_test(processor, model): - url = "http://images.cocodataset.org/val2017/000000039769.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - - inputs = processor(image) - with torch.no_grad(): - outputs = model(**inputs) - - predicted_depth = outputs.predicted_depth - fov = outputs.fov - target_sizes = [[image.height, image.width]] * len(predicted_depth) - - outputs = processor.post_process_depth_estimation( - predicted_depths=predicted_depth, - fovs=fov, - target_sizes=target_sizes, - ) - predicted_depth = outputs['predicted_depth'] - fov = outputs['fov'] - - print("\nInference ...") - print("predicted_depth:", predicted_depth) - print("predicted_depth[0].shape:", predicted_depth[0].shape) - print("fov:", fov) - print("Inference was Successfull!\n") - - -@torch.no_grad() -def convert_depth_pro_checkpoint(repo_id, filename, 
pytorch_dump_folder_path, push_to_hub=False): - """ - Copy/paste/tweak model's weights to our DepthPro structure. - """ - - # define default DepthPro configuration - config = DepthProConfig(use_fov_model=True) - - # load original weights from huggingface hub - file_path = hf_hub_download(repo_id, filename) - # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" - state_dict = torch.load(file_path, weights_only=True) - - # enumerate fusion layers - n_scaled_images = len(config.scaled_images_ratios) # 3 - n_intermediate_hooks = len(config.intermediate_hook_ids) # 2 - n_fusion_layers = n_scaled_images + n_intermediate_hooks # 5 - - # 1. keys for vit encoders - vit_rename_keys = create_vit_rename_keys(config) - for src_prefix, dest_prefix in [ - ("encoder.patch_encoder", "depth_pro.encoder.patch_encoder"), - ("encoder.image_encoder", "depth_pro.encoder.image_encoder"), - ("fov.encoder.0", "fov_model.encoder"), - ]: - for src, dest in vit_rename_keys: - src = src_prefix + "." + src - dest = dest_prefix + "." + dest - state_dict[dest] = state_dict.pop(src) - - # 2. qkv keys for vit encoders - state_dict = read_in_q_k_v(state_dict, config) - - # 3. hard coded mapping - state_dict = update_hard_coded_keys(state_dict) - - - for key in list(state_dict.keys()): - - # 4. final depth estimation head - if key.startswith("head."): - new_key = "head." + key - - # 5. fov model head - elif key.startswith("fov.head."): - new_key = key.replace("fov", 'fov_model') - - # 6. projections between encoder and fusion - elif "decoder.convs." in key: - n = re.findall(r'\d+', key)[0] # find digit inside string - n = n_fusion_layers - int(n) - 1 - new_key = f"projections.{n}.weight" - - # 7. fuse low res with image features - elif "encoder.fuse_lowres." in key: - new_key = key.replace("encoder.fuse_lowres", "depth_pro.encoder.fuse_image_with_low_res") - - # 8. fusion stage (decoder) - elif key.startswith("decoder.fusions."): - new_key = key.replace("decoder.fusions.", "fusion_stage.layers.") - new_key = new_key.replace("resnet1", "residual_layer1") - new_key = new_key.replace("resnet2", "residual_layer2") - new_key = new_key.replace("residual.1", "convolution1") - new_key = new_key.replace("residual.3", "convolution2") - new_key = new_key.replace("out_conv", "projection") - - n_with_dots = re.findall(r'.\d+.', new_key)[0] # find digit inside string followed by . 
- n = n_with_dots[1:-1] - n = n_fusion_layers - int(n) - 1 - new_key = new_key.replace(n_with_dots, f".{n}.") - - else: - continue - - state_dict[new_key] = state_dict.pop(key) - - model = DepthProForDepthEstimation(config, use_fov_model=True).eval() - model.load_state_dict(state_dict) - - processor = DepthProImageProcessorFast( - do_resize = True, - size = {"height": 1536, "width": 1536}, - resample = PILImageResampling.BILINEAR, - antialias = False, - do_rescale = True, - rescale_factor = 1 / 255, - do_normalize = True, - image_mean = 0.5, - image_std = 0.5, - return_tensors = "pt", - ) - inference_test(processor, model) - - if pytorch_dump_folder_path is not None: - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving image processor to {pytorch_dump_folder_path}") - processor.save_pretrained(pytorch_dump_folder_path) - - if push_to_hub: - hub_path = "geetu040/DepthPro" - model.push_to_hub(hub_path) - processor.push_to_hub(hub_path) - - -""" -- create files locally using function -```py -convert_depth_pro_checkpoint( - "apple/DepthPro", - "depth_pro.pt", - "my_local_depth_pro_dump", - True, -) -``` - -- create files locally using command line args -```cmd -python transformers/src/transformers/models/depth_pro/convert_depth_pro_to_hf.py \ - --repo_id "apple/DepthPro" \ - --filename "depth_pro.pt" \ - --pytorch_dump_folder_path "my_local_depth_pro_dump" \ - --push_to_hub -``` -""" - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - # Required parameters - parser.add_argument( - "--repo_id", default="apple/DepthPro", type=str, help="Name of the repo from huggingface you'd like to convert." - ) - parser.add_argument( - "--filename", default="depth_pro.pt", type=str, help="Name of the file from repo you'd like to convert." - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." - ) - parser.add_argument( - "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." - ) - - args = parser.parse_args() - convert_depth_pro_checkpoint( - args.repo_id, - args.filename, - args.pytorch_dump_folder_path, - args.push_to_hub, - ) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py new file mode 100644 index 000000000000..fe862d7469a1 --- /dev/null +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -0,0 +1,255 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
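+"""Convert Apple's original DepthPro checkpoint into the Hugging Face DepthPro format.
+
+Original checkpoint: https://huggingface.co/apple/DepthPro
+
+`ORIGINAL_TO_CONVERTED_KEY_MAPPING` below maps original parameter names to converted ones via regex
+patterns; callable replacements recompute indices where the original stores layers in reverse order
+(decoder convolutions and fusion layers), and the `(query|key|value)` placeholder marks fused qkv
+weights that `get_qkv_state_dict` splits into three equal chunks along dim 0. A sketch of that split
+(hypothetical key and a dummy tensor, chosen only for illustration):
+
+```python
+>>> import torch
+>>> split = get_qkv_state_dict("layer.0.attention.attention.(query|key|value).weight", torch.zeros(6, 2))
+>>> sorted(split)
+['layer.0.attention.attention.key.weight', 'layer.0.attention.attention.query.weight', 'layer.0.attention.attention.value.weight']
+>>> split["layer.0.attention.attention.query.weight"].shape
+torch.Size([2, 2])
+```
+"""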
+ +import argparse +import gc +import os + +import regex as re +import torch +from huggingface_hub import hf_hub_download +from transformers.image_utils import PILImageResampling + +from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig +from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast +from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation + + +# fmt: off +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + + # patch_encoder/image_encoder (ViT based) + r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", + r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", + r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", + r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", + r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", + r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", + + r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"head.(\d+).(weight|bias)": r"head.head.\1.\2", + r"decoder.convs.(\d+).weight": lambda match: ( + f"projections.{4-int(match.group(1))}.weight" + ), + + # fov_model.encoder (ViT based) + r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", + r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", + r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", + r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", + r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", + r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", + r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", + r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", + r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", + + # fov head + r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", + r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", + r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", + + # fusion stage + r"decoder.fusions.(\d+).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( + f"fusion_stage.layers.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" + ), + r"decoder.fusions.(\d+).out_conv.(weight|bias)": lambda match: ( + f"fusion_stage.layers.{4-int(match.group(1))}.projection.{match.group(2)}" + ), + 
r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: ( + f"fusion_stage.layers.{4-int(match.group(1))}.deconv.{match.group(2)}" + ), + + # qkv attentions blocks + + # upsamples (hard coded; regex is not very feasible here) + "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", + "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", + "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", + "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", + "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", + "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", + "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", + "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", + "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", + "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", + "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", + "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", + "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", + "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", + "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", +} +# fmt: on + +def convert_old_keys_to_new_keys(state_dict_keys: dict = None): + output_dict = {} + if state_dict_keys is not None: + old_text = "\n".join(state_dict_keys) + new_text = old_text + for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): + if replacement is None: + new_text = re.sub(pattern, "", new_text) # an empty line + continue + new_text = re.sub(pattern, replacement, new_text) + output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) + return output_dict + +def get_qkv_state_dict(key, parameter): + qkv_state_dict = {} + placeholder = re.search(r'(\(.*?\))', key).group(1) + replacements_keys = placeholder[1:-1].split("|") + replacements_vals = torch.split( + parameter, + split_size_or_sections=parameter.size(0)//len(replacements_keys), + dim=0 + ) + for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): + qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val + return qkv_state_dict + +def write_model( + hf_repo_id: str, + output_dir: str, + safe_serialization: bool=True, +): + os.makedirs(output_dir, exist_ok=True) + + # ------------------------------------------------------------ + # Create and save config + # ------------------------------------------------------------ + + # create config + config = DepthProConfig( + # this config is same as the default config and used for pre-trained weights + hidden_size=1024, + fusion_hidden_size=256, + num_hidden_layers=24, + num_attention_heads=16, + mlp_ratio=4, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=1536, + patch_size=384, + num_channels=3, + patch_embeddings_size=16, + qkv_bias=True, + layerscale_value=1.0, + drop_path_rate=0.0, + 
use_swiglu_ffn=False,
+        apply_layernorm=True,
+        reshape_hidden_states=True,
+        intermediate_hook_ids = [11, 5],
+        intermediate_feature_dims = [256, 256],
+        scaled_images_ratios = [0.25, 0.5, 1],
+        scaled_images_overlap_ratios = [0.0, 0.5, 0.25],
+        scaled_images_feature_dims = [1024, 1024, 512],
+        use_batch_norm_in_fusion=False,
+        use_fov_model=True,
+        num_fov_head_layers=2,
+    )
+
+    # save config
+    config.save_pretrained(output_dir)
+    print("Model config saved successfully...")
+
+    # ------------------------------------------------------------
+    # Convert weights
+    # ------------------------------------------------------------
+
+    # download and load state_dict from hf repo
+    file_path = hf_hub_download(hf_repo_id, "depth_pro.pt")
+    # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" # when you already have the files locally
+    loaded = torch.load(file_path, weights_only=True)
+
+    print("Converting model...")
+    all_keys = list(loaded.keys())
+    new_keys = convert_old_keys_to_new_keys(all_keys)
+
+    state_dict = {}
+    for key in all_keys:
+        new_key = new_keys[key]
+        current_parameter = loaded.pop(key)
+
+        if "qkv" in key:
+            qkv_state_dict = get_qkv_state_dict(new_key, current_parameter)
+            state_dict.update(qkv_state_dict)
+        else:
+            state_dict[new_key] = current_parameter
+
+    print("Loading the checkpoint in a DepthPro model.")
+    model = DepthProForDepthEstimation(config)
+    model.load_state_dict(state_dict, strict=True, assign=True)
+    print("Checkpoint loaded successfully.")
+
+    print("Saving the model.")
+    model.save_pretrained(output_dir, safe_serialization=safe_serialization)
+    del state_dict, model
+
+    # Safety check: reload the converted model
+    gc.collect()
+    print("Reloading the model to check if it's saved correctly.")
+    DepthProForDepthEstimation.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto")
+    print("Model reloaded successfully.")
+
+def write_image_processor(output_dir: str):
+    image_processor = DepthProImageProcessorFast(
+        do_resize = True,
+        size = {"height": 1536, "width": 1536},
+        resample = PILImageResampling.BILINEAR,
+        antialias = False,
+        do_rescale = True,
+        rescale_factor = 1 / 255,
+        do_normalize = True,
+        image_mean = 0.5,
+        image_std = 0.5,
+        return_tensors = "pt",
+    )
+    image_processor.save_pretrained(output_dir)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--hf_repo_id",
+        default="apple/DepthPro",
+        help="Location of official weights from apple on HF",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default="apple_DepthPro",
+        help="Location to write HF model and processor",
+    )
+    parser.add_argument(
+        "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`."
+ ) + args = parser.parse_args() + + write_model( + hf_repo_id=args.hf_repo_id, + output_dir=args.output_dir, + safe_serialization=args.safe_serialization, + ) + + write_image_processor( + output_dir=args.output_dir, + ) + + +if __name__ == "__main__": + main() From 9b67f9d2afc1b081a4990149eb16ea906ce09295 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 11:09:50 +0500 Subject: [PATCH 027/151] clean weight conversion script --- .../convert_depth_pro_weights_to_hf.py | 106 +++++++++--------- 1 file changed, 56 insertions(+), 50 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index fe862d7469a1..0b81e8907e29 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -29,39 +29,55 @@ # fmt: off ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # patch_encoder/image_encoder (ViT based) - r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", - r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", - r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", - r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", - r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", - r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", - - r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", - r"head.(\d+).(weight|bias)": r"head.head.\1.\2", + # encoder and head + r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", + r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", + r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", + r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", + r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", + r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", + 
r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"head.(\d+).(weight|bias)": r"head.head.\1.\2", + + # fov + r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", + r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", + r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", + r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", + r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", + r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", + r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", + r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", + r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", + r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", + r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", + r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", + + # upsamples (hard coded; regex is not very feasible here) + "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", + "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", + "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", + "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", + "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", + "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", + "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", + "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", + "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", + "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", + "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", + "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", + "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", + "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", + "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", + + # projections between encoder and fusion r"decoder.convs.(\d+).weight": lambda match: ( f"projections.{4-int(match.group(1))}.weight" ), - # fov_model.encoder (ViT based) - r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", - r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", - r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", - r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", - r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", - 
r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", - r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", - r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", - r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", - - # fov head - r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", - r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", - r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", - # fusion stage r"decoder.fusions.(\d+).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( f"fusion_stage.layers.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" @@ -72,25 +88,6 @@ r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: ( f"fusion_stage.layers.{4-int(match.group(1))}.deconv.{match.group(2)}" ), - - # qkv attentions blocks - - # upsamples (hard coded; regex is not very feasible here) - "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", - "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", - "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", - "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", - "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", - "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", - "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", - "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", - "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", - "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", - "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", - "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", - "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", - "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", - "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", } # fmt: on @@ -108,9 +105,18 @@ def convert_old_keys_to_new_keys(state_dict_keys: dict = None): return output_dict def get_qkv_state_dict(key, parameter): + """ + new key which looks like this + xxxx.(q|k|v).xxx (m, n) + + is converted to + xxxx.q.xxxx (m//3, n) + xxxx.k.xxxx (m//3, n) + xxxx.v.xxxx (m//3, n) + """ qkv_state_dict = {} - placeholder = re.search(r'(\(.*?\))', key).group(1) - replacements_keys = placeholder[1:-1].split("|") + placeholder = re.search(r'(\(.*?\))', key).group(1) # finds "(query|key|value)" + replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] replacements_vals = torch.split( parameter, split_size_or_sections=parameter.size(0)//len(replacements_keys), From 617c872fb90d313f03fc55962088127e659241c7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 12:57:50 +0500 Subject: [PATCH 028/151] add depth-pro status in other files --- 
src/transformers/__init__.py | 16 +++++ .../models/auto/configuration_auto.py | 2 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + src/transformers/models/depth_pro/__init__.py | 72 +++++++++++++++++++ .../convert_depth_pro_weights_to_hf.py | 8 ++- .../depth_pro/image_processing_depth_pro.py | 2 - utils/check_docstrings.py | 1 + utils/check_repo.py | 1 + 9 files changed, 101 insertions(+), 5 deletions(-) create mode 100644 src/transformers/models/depth_pro/__init__.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 47b43e0b9089..3d0b85e3a1b4 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -408,6 +408,7 @@ "DPRReaderTokenizer", ], "models.dpt": ["DPTConfig"], + "models.depth_pro": ["DepthProConfig"], "models.efficientnet": ["EfficientNetConfig"], "models.electra": [ "ElectraConfig", @@ -1195,6 +1196,7 @@ _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor", "DetrImageProcessorFast"]) _import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"]) _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) + _import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"]) _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) @@ -2136,6 +2138,13 @@ "DPTPreTrainedModel", ] ) + _import_structure["models.depth_pro"].extend( + [ + "DepthProForDepthEstimation", + "DepthProModel", + "DepthProPreTrainedModel", + ] + ) _import_structure["models.efficientnet"].extend( [ "EfficientNetForImageClassification", @@ -5272,6 +5281,7 @@ DPRReaderTokenizer, ) from .models.dpt import DPTConfig + from .models.depth_pro import DepthProConfig from .models.efficientnet import ( EfficientNetConfig, ) @@ -6100,6 +6110,7 @@ from .models.detr import DetrFeatureExtractor, DetrImageProcessor, DetrImageProcessorFast from .models.donut import DonutFeatureExtractor, DonutImageProcessor from .models.dpt import DPTFeatureExtractor, DPTImageProcessor + from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast from .models.efficientnet import EfficientNetImageProcessor from .models.flava import ( FlavaFeatureExtractor, @@ -6907,6 +6918,11 @@ DPTModel, DPTPreTrainedModel, ) + from .models.depth_pro import ( + DepthProForDepthEstimation, + DepthProModel, + DepthProPreTrainedModel, + ) from .models.efficientnet import ( EfficientNetForImageClassification, EfficientNetModel, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 48625ea3f346..d8860d38f850 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -93,6 +93,7 @@ ("donut-swin", "DonutSwinConfig"), ("dpr", "DPRConfig"), ("dpt", "DPTConfig"), + ("depth_pro", "DepthProConfig"), ("efficientformer", "EfficientFormerConfig"), ("efficientnet", "EfficientNetConfig"), ("electra", "ElectraConfig"), @@ -394,6 +395,7 @@ ("donut-swin", "DonutSwin"), ("dpr", "DPR"), ("dpt", "DPT"), + ("depth_pro", "DepthPro"), ("efficientformer", "EfficientFormer"), ("efficientnet", "EfficientNet"), ("electra", "ELECTRA"), diff --git a/src/transformers/models/auto/image_processing_auto.py 
b/src/transformers/models/auto/image_processing_auto.py index a8960d80acc8..e7b53f30a7a0 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -77,6 +77,7 @@ ("dinov2", ("BitImageProcessor",)), ("donut-swin", ("DonutImageProcessor",)), ("dpt", ("DPTImageProcessor",)), + ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")), ("efficientformer", ("EfficientFormerImageProcessor",)), ("efficientnet", ("EfficientNetImageProcessor",)), ("flava", ("FlavaImageProcessor",)), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 67c539fca664..4cc15ca4ca51 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -92,6 +92,7 @@ ("donut-swin", "DonutSwinModel"), ("dpr", "DPRQuestionEncoder"), ("dpt", "DPTModel"), + ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("electra", "ElectraModel"), @@ -571,6 +572,7 @@ ("dinat", "DinatModel"), ("dinov2", "Dinov2Model"), ("dpt", "DPTModel"), + ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("focalnet", "FocalNetModel"), @@ -866,6 +868,7 @@ # Model for depth estimation mapping ("depth_anything", "DepthAnythingForDepthEstimation"), ("dpt", "DPTForDepthEstimation"), + ("depth_pro", "DepthProForDepthEstimation"), ("glpn", "GLPNForDepthEstimation"), ("zoedepth", "ZoeDepthForDepthEstimation"), ] diff --git a/src/transformers/models/depth_pro/__init__.py b/src/transformers/models/depth_pro/__init__.py new file mode 100644 index 000000000000..1f2a6646c5c0 --- /dev/null +++ b/src/transformers/models/depth_pro/__init__.py @@ -0,0 +1,72 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
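With the auto-mapping entries added above, DepthPro becomes reachable through the generic Auto classes. A hedged usage sketch; the checkpoint id below is the staging repo used elsewhere in this series and is an assumption, not part of the patch:

```python
# Hedged sketch: resolve DepthPro through the Auto classes registered above.
from transformers import AutoConfig, AutoImageProcessor, AutoModelForDepthEstimation

checkpoint = "geetu040/DepthPro"  # staging repo; substitute the converted checkpoint path

config = AutoConfig.from_pretrained(checkpoint)                   # -> DepthProConfig
processor = AutoImageProcessor.from_pretrained(checkpoint)        # -> DepthProImageProcessor(Fast)
model = AutoModelForDepthEstimation.from_pretrained(checkpoint)   # -> DepthProForDepthEstimation
print(type(config).__name__, type(processor).__name__, type(model).__name__)
```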
+from typing import TYPE_CHECKING + +from ...file_utils import _LazyModule, is_torch_available, is_vision_available +from ...utils import OptionalDependencyNotAvailable + + +_import_structure = {"configuration_depth_pro": ["DepthProConfig"]} + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_depth_pro"] = ["DepthProImageProcessor"] + _import_structure["image_processing_depth_pro_fast"] = ["DepthProImageProcessorFast"] + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_depth_pro"] = [ + "DepthProForDepthEstimation", + "DepthProModel", + "DepthProPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_depth_pro import DepthProConfig + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_depth_pro import DepthProImageProcessor + from .image_processing_depth_pro_fast import DepthProImageProcessorFast + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_depth_pro import ( + DepthProForDepthEstimation, + DepthProModel, + DepthProPreTrainedModel, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 0b81e8907e29..741016e88a3d 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -21,9 +21,11 @@ from huggingface_hub import hf_hub_download from transformers.image_utils import PILImageResampling -from transformers.models.depth_pro.configuration_depth_pro import DepthProConfig -from transformers.models.depth_pro.image_processing_depth_pro_fast import DepthProImageProcessorFast -from transformers.models.depth_pro.modeling_depth_pro import DepthProForDepthEstimation +from transformers import ( + DepthProConfig, + DepthProImageProcessorFast, + DepthProForDepthEstimation, +) # fmt: off diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 0a7313e2d19a..99a7c26c9826 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -72,8 +72,6 @@ requires_backends, ) -from transformers.models.depth_pro.modeling_depth_pro import DepthProDepthEstimatorOutput - if is_torch_available(): import torch diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 0be960f4a33e..34deed0df47e 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -140,6 +140,7 @@ "DPRReaderTokenizer", "DPRReaderTokenizerFast", "DPTModel", + "DepthProModel", "Data2VecAudioConfig", "Data2VecTextConfig", "Data2VecTextModel", diff --git a/utils/check_repo.py b/utils/check_repo.py index 10be5cdcd262..2e131e879153 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -213,6 +213,7 @@ "JukeboxPrior", "SamModel", "DPTForDepthEstimation", + "DepthProForDepthEstimation", "DecisionTransformerGPT2Model", "GLPNForDepthEstimation", 
"ViltForImagesAndTextClassification", From 6e1c512b15474979ea3176e85214ccc70fcc6cd7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 13:33:25 +0500 Subject: [PATCH 029/151] fill docstring in config --- .../depth_pro/configuration_depth_pro.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index fae3e84432be..9b53288c41ed 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -34,8 +34,8 @@ class DepthProConfig(PretrainedConfig): Args: hidden_size (`int`, *optional*, defaults to 1024): Dimensionality of the encoder layers and the pooler layer. - fusion_hidden_size - TODO + fusion_hidden_size (`int`, *optional*, defaults to 256): + The number of channels before fusion. num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): @@ -53,15 +53,17 @@ class DepthProConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to 224): - TODO: image_size / 2**n_fusion_blocks = patch_size / patch_embeddings_size - The size (resolution) of each image. + image_size (`int`, *optional*, defaults to 1536): + The size (resolution) of each image, + To generate depth of same size as image, + image_size / 2**n_fusion_blocks == patch_size / patch_embeddings_size + where n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. - patch_embeddings_size - TODO + patch_embeddings_size (`int`, *optional*, defaults to 16): + kernel_size and stride for convolution in PatchEmbeddings. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. layerscale_value (`float`, *optional*, defaults to 1.0): @@ -77,21 +79,21 @@ class DepthProConfig(PretrainedConfig): case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, seq_len, hidden_size)`. intermediate_hook_ids - TODO + Indices of the intermediate hidden states from patch_encoder to use for fusion. intermediate_feature_dims - TODO + Hidden state during upsampling for each intermediate hidden states in intermediate_hook_ids. scaled_images_ratios - TODO + Use images of these ratios for patch_encoder. scaled_images_overlap_ratios - TODO + Overlap ratio between patches for each scaled image in scaled_image_ratios. scaled_images_feature_dims - TODO + Hidden state during upsampling for each scaled image in scaled_images_ratios. use_batch_norm_in_fusion - TODO + Whether to use batch normalization in the residual units of the fusion blocks. use_fov_model - TODO + Whether to use `DepthProFOVModel` to generate Field of View. num_fov_head_layers - TODO + No of convolution layers in head of `DepthProFOVModel`. 
Example: From 12ee607e5d319a488d7e807a75927cb86f463cec Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 18:47:53 +0500 Subject: [PATCH 030/151] formatting --- .../depth_pro/configuration_depth_pro.py | 2 +- .../convert_depth_pro_weights_to_hf.py | 28 ++++----- .../depth_pro/image_processing_depth_pro.py | 48 +++++++++------ .../image_processing_depth_pro_fast.py | 40 ++++++++----- .../models/depth_pro/modeling_depth_pro.py | 58 ++++++------------- 5 files changed, 88 insertions(+), 88 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 9b53288c41ed..8bab8227be7e 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -67,7 +67,7 @@ class DepthProConfig(PretrainedConfig): qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. layerscale_value (`float`, *optional*, defaults to 1.0): - Initial value to use for layer scale. + Initial value to use for layer scale. drop_path_rate (`float`, *optional*, defaults to 0.0): Stochastic depth rate per sample (when applied in the main path of residual layers). use_swiglu_ffn (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 741016e88a3d..c3b77f17f04c 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -41,7 +41,7 @@ r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", - r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", r"head.(\d+).(weight|bias)": r"head.head.\1.\2", # fov @@ -59,19 +59,19 @@ r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", # upsamples (hard coded; regex is not very feasible here) - "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", - "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", - "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", - "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", - "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", - "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", - "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", - "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", - "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", - "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", - "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", - 
"encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", - "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", + "encoder.upsample_latent0.0.weight": "depth_pro.encoder.upsample_intermediate.1.proj.weight", + "encoder.upsample_latent0.1.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.0.weight", + "encoder.upsample_latent0.2.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.1.weight", + "encoder.upsample_latent0.3.weight": "depth_pro.encoder.upsample_intermediate.1.upsample_blocks.2.weight", + "encoder.upsample_latent1.0.weight": "depth_pro.encoder.upsample_intermediate.0.proj.weight", + "encoder.upsample_latent1.1.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.0.weight", + "encoder.upsample_latent1.2.weight": "depth_pro.encoder.upsample_intermediate.0.upsample_blocks.1.weight", + "encoder.upsample0.0.weight": "depth_pro.encoder.upsample_scaled_images.2.proj.weight", + "encoder.upsample0.1.weight": "depth_pro.encoder.upsample_scaled_images.2.upsample_blocks.0.weight", + "encoder.upsample1.0.weight": "depth_pro.encoder.upsample_scaled_images.1.proj.weight", + "encoder.upsample1.1.weight": "depth_pro.encoder.upsample_scaled_images.1.upsample_blocks.0.weight", + "encoder.upsample2.0.weight": "depth_pro.encoder.upsample_scaled_images.0.proj.weight", + "encoder.upsample2.1.weight": "depth_pro.encoder.upsample_scaled_images.0.upsample_blocks.0.weight", "encoder.upsample_lowres.weight": "depth_pro.encoder.upsample_image.upsample_blocks.0.weight", "encoder.upsample_lowres.bias": "depth_pro.encoder.upsample_image.upsample_blocks.0.bias", diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 99a7c26c9826..0e3c7d6455b0 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -166,8 +166,8 @@ def resize( resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. Can be one of: @@ -260,8 +260,8 @@ def preprocess( `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. 
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -352,7 +352,7 @@ def preprocess( to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images ] - # depth-pro scales the image before resizing it + # depth-pro scales the image before resizing it # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) @@ -363,24 +363,36 @@ def preprocess( def post_process_depth_estimation( self, - predicted_depths, - fovs=None, - target_sizes=None, - ) -> List[Dict[str, TensorType]]: + predicted_depths: Union[TensorType, List[TensorType]], + fovs: Optional[Union[TensorType, List[TensorType], None]] = None, + target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, + ) -> Dict[str, List[TensorType]]: """ - Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. - Only supports PyTorch. + Post-processes the raw depth predictions from the model to generate final depth predictions and optionally + resizes them to specified target sizes. This function supports scaling based on the field of view (FoV) + and adjusts depth values accordingly. Args: - outputs ([`DepthEstimatorOutput`]): - Raw outputs of the model. - target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): - Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size - (height, width) of each image in the batch. If left to None, predictions will not be resized. + predicted_depths (`Union[TensorType, List[TensorType]]`): + Raw depth predictions output by the model. Can be a single tensor or a list of tensors, each + corresponding to an image in the batch. + fovs (`Optional[Union[TensorType, List[TensorType], None]]`, *optional*, defaults to `None`): + Field of view (FoV) values corresponding to each depth prediction. Should have the same length + as `predicted_depths` if provided. If `None`, FoV scaling is skipped. + target_sizes (`Optional[Union[TensorType, List[Tuple[int, int]], None]]`, *optional*, defaults to `None`): + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing + is performed. Returns: - `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth - predictions. + `Dict[str, List[TensorType]]`: + A dictionary containing: + - `"predicted_depth"`: A list of processed depth tensors. + - `"fov"`: A list of processed FoV values if provided, otherwise `None`. + + Raises: + `ValueError`: + If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched. 
""" requires_backends(self, "torch") diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 38d699452e44..3af05df3ccb8 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -15,7 +15,7 @@ """Fast Image processor class for DepthPro.""" import functools -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Tuple from ...image_processing_base import BatchFeature from ...image_processing_utils import get_size_dict @@ -308,24 +308,36 @@ def preprocess( def post_process_depth_estimation( self, - predicted_depths, - fovs=None, - target_sizes=None, - ) -> List[Dict[str, TensorType]]: + predicted_depths: Union[TensorType, List[TensorType]], + fovs: Optional[Union[TensorType, List[TensorType], None]] = None, + target_sizes: Optional[Union[TensorType, List[tuple[int, int]], None]] = None, + ) -> Dict[str, List[TensorType]]: """ - Converts the raw output of [`DepthEstimatorOutput`] into final depth predictions and depth PIL images. - Only supports PyTorch. + Post-processes the raw depth predictions from the model to generate final depth predictions and optionally + resizes them to specified target sizes. This function supports scaling based on the field of view (FoV) + and adjusts depth values accordingly. Args: - outputs ([`DepthEstimatorOutput`]): - Raw outputs of the model. - target_sizes (`TensorType` or `List[Tuple[int, int]]`, *optional*): - Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size - (height, width) of each image in the batch. If left to None, predictions will not be resized. + predicted_depths (`Union[TensorType, List[TensorType]]`): + Raw depth predictions output by the model. Can be a single tensor or a list of tensors, each + corresponding to an image in the batch. + fovs (`Optional[Union[TensorType, List[TensorType], None]]`, *optional*, defaults to `None`): + Field of view (FoV) values corresponding to each depth prediction. Should have the same length + as `predicted_depths` if provided. If `None`, FoV scaling is skipped. + target_sizes (`Optional[Union[TensorType, List[tuple[int, int]], None]]`, *optional*, defaults to `None`): + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing + is performed. Returns: - `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth - predictions. + `Dict[str, List[TensorType]]`: + A dictionary containing: + - `"predicted_depth"`: A list of processed depth tensors. + - `"fov"`: A list of processed FoV values if provided, otherwise `None`. + + Raises: + `ValueError`: + If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched. 
""" requires_backends(self, "torch") diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index eb8bf02f83d1..b184b5985ba1 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -44,6 +44,13 @@ logger = logging.get_logger(__name__) +# General docstring +_CONFIG_FOR_DOC = "DepthProConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "geetu040/DepthPro" +_EXPECTED_OUTPUT_SHAPE = [1, 577, 1024] + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings with Dinov2->DepthProViT class DepthProViTPatchEmbeddings(nn.Module): @@ -942,7 +949,7 @@ def forward( # STEP 8: return these features in order of increasing size as what fusion expects last_hidden_state = [ # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) - *scaled_images_features, + *scaled_images_features, # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) *intermediate_features, ] @@ -1049,14 +1056,7 @@ class PreTrainedModel self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) - # TODO - # @add_code_sample_docstrings( - # checkpoint=_CHECKPOINT_FOR_DOC, - # output_type=BaseModelOutputWithPoolingAndIntermediateActivations, - # config_class=_CONFIG_FOR_DOC, - # modality="vision", - # expected_output=_EXPECTED_OUTPUT_SHAPE, - # ) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.FloatTensor, @@ -1065,6 +1065,13 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: + r""" + Returns: + + Examples: + TODO + ```python + ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1399,7 +1406,7 @@ def __init__(self, config, use_fov_model=None): @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) - # @replace_return_docstrings(output_type=DepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=DepthProDepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) def forward( self, pixel_values: torch.FloatTensor, @@ -1418,37 +1425,6 @@ def forward( Examples: TODO ```python - >>> from transformers import AutoImageProcessor, DPTForDepthEstimation - >>> import torch - >>> import numpy as np - >>> from PIL import Image - >>> import requests - - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) - - >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large") - >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") - - >>> # prepare image for the model - >>> inputs = image_processor(images=image, return_tensors="pt") - - >>> with torch.no_grad(): - ... outputs = model(**inputs) - ... predicted_depth = outputs.predicted_depth - - >>> # interpolate to original size - >>> prediction = torch.nn.functional.interpolate( - ... predicted_depth.unsqueeze(1), - ... size=image.size[::-1], - ... mode="bicubic", - ... align_corners=False, - ... 
) - - >>> # visualize the prediction - >>> output = prediction.squeeze().cpu().numpy() - >>> formatted = (output * 255 / np.max(output)).astype("uint8") - >>> depth = Image.fromarray(formatted) ```""" loss = None if labels is not None: From d0a8733f275941adb827a4f7e3850c2a28d66006 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 19:25:43 +0500 Subject: [PATCH 031/151] more formatting --- .../models/depth_pro/image_processing_depth_pro.py | 7 +++---- src/transformers/models/depth_pro/modeling_depth_pro.py | 7 +------ 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 0e3c7d6455b0..21810bfab645 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -15,14 +15,13 @@ """Image processor class for DepthPro.""" from typing import Dict, List, Optional, Union -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import numpy as np -from icecream import ic from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import resize, to_channel_dimension_format +from ...image_transforms import to_channel_dimension_format from ...image_utils import ( IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, @@ -39,7 +38,7 @@ from ...utils import TensorType, filter_out_non_signature_kwargs, logging import math -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union if TYPE_CHECKING: diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index b184b5985ba1..3812f678b43f 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -14,23 +14,18 @@ # limitations under the License. 
"""PyTorch DepthPro model.""" -from icecream import ic - -import collections.abc import math -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import List, Optional, Set, Tuple, Union import torch from torch import nn from dataclasses import dataclass -from ...utils import ModelOutput from ...activations import ACT2FN from ...modeling_outputs import ( BaseModelOutput, DepthEstimatorOutput ) from ...utils import ( - add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging, From e6b385a9edf92a5c7f342935d75ae3e017fe122c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 19:45:20 +0500 Subject: [PATCH 032/151] formatting with ruff --- .../convert_depth_pro_weights_to_hf.py | 6 +-- .../depth_pro/image_processing_depth_pro.py | 39 ++----------------- .../image_processing_depth_pro_fast.py | 5 ++- .../models/depth_pro/modeling_depth_pro.py | 10 ++--- 4 files changed, 13 insertions(+), 47 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index c3b77f17f04c..66dfff12065a 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -19,13 +19,13 @@ import regex as re import torch from huggingface_hub import hf_hub_download -from transformers.image_utils import PILImageResampling from transformers import ( DepthProConfig, - DepthProImageProcessorFast, DepthProForDepthEstimation, + DepthProImageProcessorFast, ) +from transformers.image_utils import PILImageResampling # fmt: off @@ -126,7 +126,7 @@ def get_qkv_state_dict(key, parameter): ) for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val - return qkv_state_dict + return qkv_state_dict def write_model( hf_repo_id: str, diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 21810bfab645..6c9c7f94e226 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -14,12 +14,10 @@ # limitations under the License. 
"""Image processor class for DepthPro.""" -from typing import Dict, List, Optional, Union -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np - from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import to_channel_dimension_format from ...image_utils import ( @@ -30,43 +28,15 @@ PILImageResampling, infer_channel_dimension_format, is_scaled_image, - make_list_of_images, - to_numpy_array, - valid_images, - pil_torch_interpolation_mapping, -) -from ...utils import TensorType, filter_out_non_signature_kwargs, logging - -import math -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union - - -if TYPE_CHECKING: - from ...modeling_outputs import DepthEstimatorOutput - -import numpy as np - -from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict -from ...image_transforms import pad, resize, to_channel_dimension_format -from ...image_utils import ( - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, - ChannelDimension, - ImageInput, - PILImageResampling, - get_image_size, - infer_channel_dimension_format, - is_scaled_image, is_torch_available, - is_torch_tensor, make_list_of_images, + pil_torch_interpolation_mapping, to_numpy_array, valid_images, ) from ...utils import ( TensorType, filter_out_non_signature_kwargs, - is_vision_available, logging, requires_backends, ) @@ -75,9 +45,6 @@ if is_torch_available(): import torch -if is_vision_available(): - import PIL - logger = logging.get_logger(__name__) @@ -379,7 +346,7 @@ def post_process_depth_estimation( Field of view (FoV) values corresponding to each depth prediction. Should have the same length as `predicted_depths` if provided. If `None`, FoV scaling is skipped. target_sizes (`Optional[Union[TensorType, List[Tuple[int, int]], None]]`, *optional*, defaults to `None`): - Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing is performed. diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 3af05df3ccb8..46b502d7d26f 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -15,7 +15,7 @@ """Fast Image processor class for DepthPro.""" import functools -from typing import Dict, List, Optional, Union, Tuple +from typing import Dict, List, Optional, Union from ...image_processing_base import BatchFeature from ...image_processing_utils import get_size_dict @@ -35,6 +35,7 @@ from ...utils import TensorType, logging, requires_backends from ...utils.import_utils import is_torch_available, is_torchvision_available + logger = logging.get_logger(__name__) @@ -325,7 +326,7 @@ def post_process_depth_estimation( Field of view (FoV) values corresponding to each depth prediction. Should have the same length as `predicted_depths` if provided. If `None`, FoV scaling is skipped. target_sizes (`Optional[Union[TensorType, List[tuple[int, int]], None]]`, *optional*, defaults to `None`): - Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)` + Target sizes to resize the depth predictions. 
Can be a tensor of shape `(batch_size, 2)` or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing is performed. diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 3812f678b43f..5b521cfda9bd 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -15,16 +15,16 @@ """PyTorch DepthPro model.""" import math +from dataclasses import dataclass from typing import List, Optional, Set, Tuple, Union import torch from torch import nn -from dataclasses import dataclass from ...activations import ACT2FN -from ...modeling_outputs import ( - BaseModelOutput, DepthEstimatorOutput -) +from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, @@ -32,8 +32,6 @@ replace_return_docstrings, torch_int, ) -from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from .configuration_depth_pro import DepthProConfig From 267e50fbe2288de71428776adebaea51b902751c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 27 Nov 2024 19:46:50 +0500 Subject: [PATCH 033/151] formatting with style --- src/transformers/__init__.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3d0b85e3a1b4..0e6c48762a85 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -5262,6 +5262,7 @@ XLMProphetNetConfig, ) from .models.depth_anything import DepthAnythingConfig + from .models.depth_pro import DepthProConfig from .models.detr import DetrConfig from .models.dinat import DinatConfig from .models.dinov2 import Dinov2Config @@ -5281,7 +5282,6 @@ DPRReaderTokenizer, ) from .models.dpt import DPTConfig - from .models.depth_pro import DepthProConfig from .models.efficientnet import ( EfficientNetConfig, ) @@ -6107,10 +6107,10 @@ from .models.deprecated.efficientformer import EfficientFormerImageProcessor from .models.deprecated.tvlt import TvltImageProcessor from .models.deprecated.vit_hybrid import ViTHybridImageProcessor + from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast from .models.detr import DetrFeatureExtractor, DetrImageProcessor, DetrImageProcessorFast from .models.donut import DonutFeatureExtractor, DonutImageProcessor from .models.dpt import DPTFeatureExtractor, DPTImageProcessor - from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast from .models.efficientnet import EfficientNetImageProcessor from .models.flava import ( FlavaFeatureExtractor, @@ -6872,6 +6872,11 @@ DepthAnythingForDepthEstimation, DepthAnythingPreTrainedModel, ) + from .models.depth_pro import ( + DepthProForDepthEstimation, + DepthProModel, + DepthProPreTrainedModel, + ) from .models.detr import ( DetrForObjectDetection, DetrForSegmentation, @@ -6918,11 +6923,6 @@ DPTModel, DPTPreTrainedModel, ) - from .models.depth_pro import ( - DepthProForDepthEstimation, - DepthProModel, - DepthProPreTrainedModel, - ) from .models.efficientnet import ( EfficientNetForImageClassification, EfficientNetModel, From a1ec99743563ae054ae159a7d83dc76e9c09a4ab Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 28 Nov 2024 
00:48:06 +0500 Subject: [PATCH 034/151] fix copied classes --- .../depth_pro/configuration_depth_pro.py | 48 ++-- .../convert_depth_pro_weights_to_hf.py | 44 ++-- .../depth_pro/image_processing_depth_pro.py | 9 +- .../image_processing_depth_pro_fast.py | 9 +- .../models/depth_pro/modeling_depth_pro.py | 225 ++++++++++-------- 5 files changed, 174 insertions(+), 161 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 8bab8227be7e..d938f0a721f1 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -78,22 +78,22 @@ class DepthProConfig(PretrainedConfig): Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, seq_len, hidden_size)`. - intermediate_hook_ids - Indices of the intermediate hidden states from patch_encoder to use for fusion. - intermediate_feature_dims - Hidden state during upsampling for each intermediate hidden states in intermediate_hook_ids. - scaled_images_ratios - Use images of these ratios for patch_encoder. - scaled_images_overlap_ratios - Overlap ratio between patches for each scaled image in scaled_image_ratios. - scaled_images_feature_dims - Hidden state during upsampling for each scaled image in scaled_images_ratios. - use_batch_norm_in_fusion - Whether to use batch normalization in the residual units of the fusion blocks. - use_fov_model - Whether to use `DepthProFOVModel` to generate Field of View. - num_fov_head_layers - No of convolution layers in head of `DepthProFOVModel`. + intermediate_hook_ids (`List[int]`, *optional*, defaults to `[11, 5]`): + Indices of the intermediate hidden states from the patch encoder to use for fusion. + intermediate_feature_dims (`List[int]`, *optional*, defaults to `[256, 256]`): + Hidden state dimensions during upsampling for each intermediate hidden state in `intermediate_hook_ids`. + scaled_images_ratios (`List[float]`, *optional*, defaults to `[0.25, 0.5, 1]`): + Ratios of scaled images to be used by the patch encoder. + scaled_images_overlap_ratios (`List[float]`, *optional*, defaults to `[0.0, 0.5, 0.25]`): + Overlap ratios between patches for each scaled image in `scaled_images_ratios`. + scaled_images_feature_dims (`List[int]`, *optional*, defaults to `[1024, 1024, 512]`): + Hidden state dimensions during upsampling for each scaled image in `scaled_images_ratios`. + use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): + Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_fov_model (`bool`, *optional*, defaults to `True`): + Whether to use `DepthProFOVModel` to generate the field of view. + num_fov_head_layers (`int`, *optional*, defaults to `2`): + Number of convolution layers in the head of `DepthProFOVModel`. 
Example: @@ -134,12 +134,13 @@ def __init__( use_swiglu_ffn=False, apply_layernorm=True, reshape_hidden_states=True, - intermediate_hook_ids = [11, 5], - intermediate_feature_dims = [256, 256], - scaled_images_ratios = [0.25, 0.5, 1], - scaled_images_overlap_ratios = [0.0, 0.5, 0.25], - scaled_images_feature_dims = [1024, 1024, 512], - use_batch_norm_in_fusion=False, + intermediate_hook_ids=[11, 5], + intermediate_feature_dims=[256, 256], + scaled_images_ratios=[0.25, 0.5, 1], + scaled_images_overlap_ratios=[0.0, 0.5, 0.25], + scaled_images_feature_dims=[1024, 1024, 512], + use_batch_norm_in_fusion_residual=False, + use_bias_in_fusion_residual=True, use_fov_model=True, num_fov_head_layers=2, **kwargs, @@ -166,7 +167,8 @@ def __init__( self.use_swiglu_ffn = use_swiglu_ffn self.apply_layernorm = apply_layernorm self.reshape_hidden_states = reshape_hidden_states - self.use_batch_norm_in_fusion = use_batch_norm_in_fusion + self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual + self.use_bias_in_fusion_residual = use_bias_in_fusion_residual self.use_fov_model = use_fov_model self.num_fov_head_layers = num_fov_head_layers self.intermediate_hook_ids = intermediate_hook_ids diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 66dfff12065a..377595b746ac 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -93,6 +93,7 @@ } # fmt: on + def convert_old_keys_to_new_keys(state_dict_keys: dict = None): output_dict = {} if state_dict_keys is not None: @@ -106,6 +107,7 @@ def convert_old_keys_to_new_keys(state_dict_keys: dict = None): output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) return output_dict + def get_qkv_state_dict(key, parameter): """ new key which looks like this @@ -117,21 +119,20 @@ def get_qkv_state_dict(key, parameter): xxxx.v.xxxx (m//3, n) """ qkv_state_dict = {} - placeholder = re.search(r'(\(.*?\))', key).group(1) # finds "(query|key|value)" - replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] + placeholder = re.search(r"(\(.*?\))", key).group(1) # finds "(query|key|value)" + replacements_keys = placeholder[1:-1].split("|") # creates ['query', 'key', 'value'] replacements_vals = torch.split( - parameter, - split_size_or_sections=parameter.size(0)//len(replacements_keys), - dim=0 + parameter, split_size_or_sections=parameter.size(0) // len(replacements_keys), dim=0 ) for replacement_key, replacement_val in zip(replacements_keys, replacements_vals): qkv_state_dict[key.replace(placeholder, replacement_key)] = replacement_val return qkv_state_dict + def write_model( hf_repo_id: str, output_dir: str, - safe_serialization: bool=True, + safe_serialization: bool = True, ): os.makedirs(output_dir, exist_ok=True) @@ -162,11 +163,11 @@ def write_model( use_swiglu_ffn=False, apply_layernorm=True, reshape_hidden_states=True, - intermediate_hook_ids = [11, 5], - intermediate_feature_dims = [256, 256], - scaled_images_ratios = [0.25, 0.5, 1], - scaled_images_overlap_ratios = [0.0, 0.5, 0.25], - scaled_images_feature_dims = [1024, 1024, 512], + intermediate_hook_ids=[11, 5], + intermediate_feature_dims=[256, 256], + scaled_images_ratios=[0.25, 0.5, 1], + scaled_images_overlap_ratios=[0.0, 0.5, 0.25], + scaled_images_feature_dims=[1024, 1024, 512], use_batch_norm_in_fusion=False, use_fov_model=True, 
num_fov_head_layers=2, @@ -215,18 +216,19 @@ def write_model( DepthProForDepthEstimation.from_pretrained(output_dir, torch_dtype=torch.bfloat16, device_map="auto") print("Model reloaded successfully.") + def write_image_processor(output_dir: str): image_processor = DepthProImageProcessorFast( - do_resize = True, - size = {"height": 1536, "width": 1536}, - resample = PILImageResampling.BILINEAR, - antialias = False, - do_rescale = True, - rescale_factor = 1 / 255, - do_normalize = True, - image_mean = 0.5, - image_std = 0.5, - return_tensors = "pt", + do_resize=True, + size={"height": 1536, "width": 1536}, + resample=PILImageResampling.BILINEAR, + antialias=False, + do_rescale=True, + rescale_factor=1 / 255, + do_normalize=True, + image_mean=0.5, + image_std=0.5, + return_tensors="pt", ) image_processor.save_pretrained(output_dir) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 6c9c7f94e226..15a33f804d14 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -371,18 +371,13 @@ def post_process_depth_estimation( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) - outputs = { - "predicted_depth": [], - "fov": [] if fovs is not None else None - } + outputs = {"predicted_depth": [], "fov": [] if fovs is not None else None} fovs = [None] * len(predicted_depths) if fovs is None else fovs target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): - if target_size is not None: - # scale image w.r.t fov if fov is not None: width = target_size[1] @@ -395,7 +390,7 @@ def post_process_depth_estimation( predicted_depth.unsqueeze(0).unsqueeze(1), size=target_size, resample=self.resample, - antialias=self.antialias + antialias=self.antialias, ).squeeze() # inverse the depth diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 46b502d7d26f..374d5c25cafc 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -161,7 +161,7 @@ def _build_transforms( Resize( (size["height"], size["width"]), interpolation=pil_torch_interpolation_mapping[resample], - antialias=antialias + antialias=antialias, ) ) @@ -351,18 +351,13 @@ def post_process_depth_estimation( "Make sure that you pass in as many fov values as the batch dimension of the predicted depth" ) - outputs = { - "predicted_depth": [], - "fov": [] if fovs is not None else None - } + outputs = {"predicted_depth": [], "fov": [] if fovs is not None else None} fovs = [None] * len(predicted_depths) if fovs is None else fovs target_sizes = [None] * len(predicted_depths) if target_sizes is None else target_sizes for predicted_depth, fov, target_size in zip(predicted_depths, fovs, target_sizes): - if target_size is not None: - # scale image w.r.t fov if fov is not None: width = target_size[1] diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 5b521cfda9bd..77983933a19a 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -40,17 +40,11 @@ # General 
docstring _CONFIG_FOR_DOC = "DepthProConfig" -# Base docstring -_CHECKPOINT_FOR_DOC = "geetu040/DepthPro" -_EXPECTED_OUTPUT_SHAPE = [1, 577, 1024] - -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings with Dinov2->DepthProViT class DepthProViTPatchEmbeddings(nn.Module): """ - This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial - `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a - Transformer. + Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings + with addition of config parameter patch_embeddings_size """ def __init__(self, config): @@ -60,6 +54,7 @@ def __init__(self, config): self.in_channels = config.num_channels self.out_channels = config.hidden_size self.patch_embeddings_size = config.patch_embeddings_size + self.num_channels = config.num_channels self.projection = nn.Conv2d( self.in_channels, @@ -68,9 +63,10 @@ def __init__(self, config): stride=(self.patch_embeddings_size, self.patch_embeddings_size), ) + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings.forward def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: num_channels = pixel_values.shape[1] - if num_channels != self.config.num_channels: + if num_channels != self.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." f" Expected {self.num_channels} but got {num_channels}." @@ -79,11 +75,10 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -# Copied from transformers.models.dinov2.modeling_dinov2.DepthProViTEmbeddings -# with DepthProViT->DepthProViT and antialias=True in interpolation class DepthProViTEmbeddings(nn.Module): """ - Construct the CLS token, position and patch embeddings. 
+ Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Embeddings + except antialias=True in interpolation and removal of mask_token """ def __init__(self, config: DepthProConfig) -> None: @@ -131,7 +126,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: size=(new_height, new_width), mode="bicubic", align_corners=False, - antialias=True, # except for this, the class is same as transformers.models.dinov2.modeling_dinov2.DepthProPatchEmbeddings + antialias=True, # except for this, the class is same as transformers.models.dinov2.modeling_dinov2.DepthProViTPatchEmbeddings ).to(dtype=target_dtype) patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) @@ -155,7 +150,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthPro class DepthProViTSelfAttention(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -216,7 +211,7 @@ def forward( return outputs -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SelfAttention with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention with Dinov2Config->DepthProConfig, Dinov2->DepthProViT class DepthProViTSdpaSelfAttention(DepthProViTSelfAttention): def __init__(self, config: DepthProConfig) -> None: super().__init__(config) @@ -226,8 +221,9 @@ def forward( self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. logger.warning_once( - "DepthProModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + "DepthProViTModel is using DepthProViTSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
) return super().forward( @@ -257,7 +253,7 @@ def forward( return context_layer, None -# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTSelfOutput(nn.Module): """ The residual connection is defined in DepthProViTLayer instead of here (as is the case with other models), due to the @@ -276,7 +272,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTAttention(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -316,14 +312,14 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTSdpaAttention(DepthProViTAttention): def __init__(self, config: DepthProConfig) -> None: super().__init__(config) self.attention = DepthProViTSdpaSelfAttention(config) -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaAttention with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2LayerScale with Dinov2Config->DepthProConfig, Dinov2->DepthProViT class DepthProViTLayerScale(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -369,7 +365,7 @@ def extra_repr(self) -> str: return "p={}".format(self.drop_prob) -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2MLP with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2MLP with Dinov2->DepthPro class DepthProViTMLP(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -389,7 +385,7 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: return hidden_state -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SwiGLUFFN with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SwiGLUFFN with Dinov2->DepthPro class DepthProViTSwiGLUFFN(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -413,7 +409,7 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: } -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2->DepthProViT +# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2Config->DepthProConfig, Dinov2->DepthProViT all-casing class DepthProViTLayer(nn.Module): """This corresponds to the Block class in the original implementation.""" @@ -465,7 +461,7 @@ def forward( return outputs -# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->DepthProViT +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -569,14 +565,14 @@ def forward( class DepthProUpsampleBlock(nn.Module): def __init__( - self, - input_dims, - intermediate_dims, - output_dims, - n_upsample_layers, - use_proj=True, - bias=False, - ) -> None: + self, + input_dims, + intermediate_dims, + output_dims, + n_upsample_layers, + use_proj=True, + bias=False, + ) -> None: super().__init__() # create first 
projection block @@ -620,6 +616,7 @@ def interpolate(pixel_values, scale_factor): align_corners=False, ) + def patch(pixel_values, patch_size, overlap_ratio): """Creates Patches from Batch.""" B, C, W, H = pixel_values.shape @@ -631,9 +628,7 @@ def patch(pixel_values, patch_size, overlap_ratio): stride = int(patch_size * (1 - overlap_ratio)) # (B, C, W, H) - patches = torch.nn.functional.unfold( - pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride) - ) + patches = torch.nn.functional.unfold(pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride)) # patches.shape (B, patch_size**2 * C, num_patches) patches = patches.permute(2, 0, 1) # patches.shape (num_patches, B, patch_size**2 * C) @@ -642,11 +637,12 @@ def patch(pixel_values, patch_size, overlap_ratio): return patches + def reshape_feature(hidden_states, width, height): """Discard class token and reshape 1D feature map to a 2D grid.""" B, _, C = hidden_states.shape # (B, WH+1, C) - hidden_states = hidden_states[:, 1:, :] # remove class token + hidden_states = hidden_states[:, 1:, :] # remove class token # (B, WH, C) hidden_states = hidden_states.reshape(B, width, height, C) # (B, W, H, C) @@ -654,6 +650,7 @@ def reshape_feature(hidden_states, width, height): # (B, C, W, H) return hidden_states + def merge(patches, batch_size, merge_out_size): """Recreates Batch from Patches.""" num_patches, num_channels, out_size, out_size = patches.shape @@ -668,7 +665,7 @@ def merge(patches, batch_size, merge_out_size): merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) """ - padding = ( box_size * out_size - merge_out_size ) // ( 2 * box_size - 2 ) + padding = (box_size * out_size - merge_out_size) // (2 * box_size - 2) i = 0 boxes = [] @@ -685,10 +682,10 @@ def merge(patches, batch_size, merge_out_size): box = box[..., :, padding:] if h != box_size - 1: # remove pad from height if box is not at bottom border - box = box[..., :box.shape[-2]-padding, :] + box = box[..., : box.shape[-2] - padding, :] if w != box_size - 1: # remove pad from width if box is not at right border - box = box[..., :, :box.shape[-1]-padding] + box = box[..., :, : box.shape[-1] - padding] boxes_in_row.append(box) i += 1 @@ -717,13 +714,12 @@ def __init__(self, config: DepthProConfig) -> None: self.n_scaled_images = len(self.scaled_images_ratios) self.n_intermediate_hooks = len(self.intermediate_hook_ids) self.out_size = config.patch_size // config.patch_embeddings_size - self.seq_len = self.out_size ** 2 # each patch is flattened + self.seq_len = self.out_size**2 # each patch is flattened # config.scaled_images_ratios is sorted if config.scaled_images_ratios != sorted(config.scaled_images_ratios): raise ValueError( - f"Values in scaled_images_ratios={config.scaled_images_ratios} " - "should be sorted from low to high" + f"Values in scaled_images_ratios={config.scaled_images_ratios} " "should be sorted from low to high" ) # lowest image resolution is greator than the patch_size @@ -767,7 +763,7 @@ def __init__(self, config: DepthProConfig) -> None: input_dims=config.hidden_size, intermediate_dims=intermediate_dims, output_dims=feature_dims, - n_upsample_layers=2+i, + n_upsample_layers=2 + i, ) self.upsample_intermediate.append(upsample_block) @@ -783,7 +779,7 @@ def __init__(self, config: DepthProConfig) -> None: # for STEP 7: fuse low_res and image features self.fuse_image_with_low_res = nn.Conv2d( - 
in_channels=config.scaled_images_feature_dims[0]*2, + in_channels=config.scaled_images_feature_dims[0] * 2, out_channels=config.scaled_images_feature_dims[0], kernel_size=1, stride=1, @@ -838,7 +834,7 @@ def forward( overlap_ratio=self.scaled_images_overlap_ratios[i], ) scaled_images_num_patches = [len(i) for i in scaled_images] - patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first + patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high res patches first # (sum(scaled_images_num_patches), config.num_channels, config.patch_size, config.patch_size) # STEP 3: apply patch and image encoder @@ -847,16 +843,15 @@ def forward( patches, head_mask=head_mask, output_attentions=output_attentions, - output_hidden_states=True, # required for intermediate features + output_hidden_states=True, # required for intermediate features return_dict=True, ) scaled_images_last_hidden_state = torch.split_with_sizes( - patch_encodings.last_hidden_state, - scaled_images_num_patches[::-1] - )[::-1] # -1 as patch encoder expects high res patches first + patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] + )[::-1] # -1 as patch encoder expects high res patches first image_encodings = self.image_encoder( - pixel_values=scaled_images[0], # provide least resolution image + pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -874,12 +869,12 @@ def forward( # b. reshape back to image like features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) + ) # (scaled_images_num_patches[i], config.num_channels, self.out_size, self.out_size) # c. merge patches back together features = merge( - features, batch_size=B, merge_out_size=self.out_size*2**i - ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) + features, batch_size=B, merge_out_size=self.out_size * 2**i + ) # (B, config.hidden_size, self.out_size*2**i, self.out_size*2**i) # d. upsample features = self.upsample_scaled_images[i](features) @@ -891,11 +886,14 @@ def forward( intermediate_features = [] for i in range(self.n_intermediate_hooks): - # a. extract hidden_state - layer_id = self.intermediate_hook_ids[i] + 1 # +1 to correct index position as hidden_states contain embedding output as well + layer_id = ( + self.intermediate_hook_ids[i] + 1 + ) # +1 to correct index position as hidden_states contain embedding output as well hidden_state = patch_encodings.hidden_states[layer_id] - hidden_state = hidden_state[:scaled_images_num_patches[-1]] # num_patches to be of same length as highest resolution + hidden_state = hidden_state[ + : scaled_images_num_patches[-1] + ] # num_patches to be of same length as highest resolution # (scaled_images_num_patches[-1], self.seq_len+1, config.hidden_size) # b. reshape back to image like @@ -903,12 +901,14 @@ def forward( hidden_state, self.out_size, self.out_size, - ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[-1], config.hidden_size, self.out_size, self.out_size) # c. 
merge patches back together features = merge( - features, batch_size=B, merge_out_size=self.out_size*2**(self.n_scaled_images-1), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + features, + batch_size=B, + merge_out_size=self.out_size * 2 ** (self.n_scaled_images - 1), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample features = self.upsample_intermediate[i](features) @@ -919,20 +919,26 @@ def forward( # STEP 6: get image features - (6) in diagram # a. extract hidden_state - hidden_state = image_encodings.last_hidden_state # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) + hidden_state = ( + image_encodings.last_hidden_state + ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. merge patches back together image_features = merge( - image_features, batch_size=B, merge_out_size=self.out_size*2**(0), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + image_features, + batch_size=B, + merge_out_size=self.out_size * 2 ** (0), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample - image_features = self.upsample_image(image_features) # (B, config.scaled_images_feature_dims[0], self.out_size*2**1, self.out_size*2**1) + image_features = self.upsample_image( + image_features + ) # (B, config.scaled_images_feature_dims[0], self.out_size*2**1, self.out_size*2**1) # STEP 7: apply fusion (global_features = image_features + scaled_images_features[0]) # fuses image_features with lowest resolution features as they are of same size @@ -1089,37 +1095,49 @@ def forward( return encodings -# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPTPreAct->DepthPro -class DepthProResidualLayer(nn.Module): +# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPT->DepthPro +class DepthProPreActResidualLayer(nn.Module): + """ + ResidualConvUnit, pre-activate residual unit. + + Args: + config (`[DepthProConfig]`): + Model configuration class defining the model architecture. 
+ """ + def __init__(self, config): super().__init__() - self.use_batch_norm = config.use_batch_norm_in_fusion - self.hidden_size = config.fusion_hidden_size + self.use_batch_norm = config.use_batch_norm_in_fusion_residual + use_bias_in_fusion_residual = ( + config.use_bias_in_fusion_residual + if config.use_bias_in_fusion_residual is not None + else not self.use_batch_norm + ) self.activation1 = nn.ReLU() self.convolution1 = nn.Conv2d( - self.hidden_size, - self.hidden_size, + config.fusion_hidden_size, + config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, - bias=(not self.use_batch_norm), + bias=use_bias_in_fusion_residual, ) self.activation2 = nn.ReLU() self.convolution2 = nn.Conv2d( - self.hidden_size, - self.hidden_size, + config.fusion_hidden_size, + config.fusion_hidden_size, kernel_size=3, stride=1, padding=1, - bias=(not self.use_batch_norm), + bias=use_bias_in_fusion_residual, ) if self.use_batch_norm: - self.batch_norm1 = nn.BatchNorm2d(self.hidden_size) - self.batch_norm2 = nn.BatchNorm2d(self.hidden_size) + self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size) + self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size) def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: residual = hidden_state @@ -1139,15 +1157,16 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: return hidden_state + residual -# Implementation resembles transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +# Taken from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer +# except it uses deconv, skip_add and avoids interpolation (it always receives consitent inputs) class DepthProFeatureFusionLayer(nn.Module): - def __init__(self, config: DepthProConfig, use_deconv:bool=True) -> None: + def __init__(self, config: DepthProConfig, use_deconv: bool = True) -> None: super().__init__() self.config = config self.use_deconv = use_deconv - self.residual_layer1 = DepthProResidualLayer(config) - self.residual_layer2 = DepthProResidualLayer(config) + self.residual_layer1 = DepthProPreActResidualLayer(config) + self.residual_layer2 = DepthProPreActResidualLayer(config) if self.use_deconv: self.deconv = nn.ConvTranspose2d( @@ -1174,13 +1193,14 @@ def forward(self, hidden_state, residual=None): return hidden_state -# Copied from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro with extra layer parameters +# Take from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro +# with extra layer parameters, deconv and reversed layers class DepthProFeatureFusionStage(nn.Module): def __init__(self, config, num_layers): super().__init__() self.num_layers = num_layers self.layers = nn.ModuleList() - for _ in range(self.num_layers-1): + for _ in range(self.num_layers - 1): self.layers.append(DepthProFeatureFusionLayer(config)) # final layer doesnot require deconvolution self.layers.append(DepthProFeatureFusionLayer(config, use_deconv=False)) @@ -1214,7 +1234,7 @@ def __init__(self, config: DepthProConfig) -> None: self.encoder_neck = nn.Linear(self.hidden_size, self.fusion_hidden_size // 2) self.global_neck = nn.Sequential( nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), - nn.ReLU(True) + nn.ReLU(True), ) if config.fusion_hidden_size // 2**config.num_fov_head_layers == 0: @@ -1227,19 +1247,21 @@ def __init__(self, config: DepthProConfig) -> None: self.head = nn.Sequential() for i in range(config.num_fov_head_layers): self.head.append( - 
nn.Conv2d(self.fusion_hidden_size // 2**(i+1), self.fusion_hidden_size // 2**(i+2), kernel_size=3, stride=2, padding=1) + nn.Conv2d( + self.fusion_hidden_size // 2 ** (i + 1), + self.fusion_hidden_size // 2 ** (i + 2), + kernel_size=3, + stride=2, + padding=1, + ) ) self.head.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer - final_in_channels = self.fusion_hidden_size // 2**(config.num_fov_head_layers+1) + final_in_channels = self.fusion_hidden_size // 2 ** (config.num_fov_head_layers + 1) final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.head.append( nn.Conv2d( - in_channels=final_in_channels, - out_channels=1, - kernel_size=final_kernal_size, - stride=1, - padding=0 + in_channels=final_in_channels, out_channels=1, kernel_size=final_kernal_size, stride=1, padding=0 ) ) @@ -1263,7 +1285,7 @@ def forward( # follow the steps same as with image features in DepthProEncoder pixel_values = interpolate( pixel_values, - scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image + scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image ) patches = patch( pixel_values, @@ -1279,11 +1301,7 @@ def forward( ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.encoder_neck(last_hidden_state) - last_hidden_state = reshape_feature( - last_hidden_state, - width=self.out_size, - height=self.out_size - ) + last_hidden_state = reshape_feature(last_hidden_state, width=self.out_size, height=self.out_size) last_hidden_state = merge( last_hidden_state, batch_size=B, @@ -1321,12 +1339,11 @@ def __init__(self, config): features = config.fusion_hidden_size self.head = nn.Sequential( - nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1), + nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), nn.ConvTranspose2d( - in_channels=features//2, out_channels=features//2, - kernel_size=2, stride=2, padding=0, bias=True + in_channels=features // 2, out_channels=features // 2, kernel_size=2, stride=2, padding=0, bias=True ), - nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), nn.ReLU(True), nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), nn.ReLU(), @@ -1347,6 +1364,7 @@ class DepthProDepthEstimatorOutput(DepthEstimatorOutput): fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. 
""" + fov: Optional[torch.FloatTensor] = None @@ -1369,7 +1387,7 @@ def __init__(self, config, use_fov_model=None): combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims self.projections = nn.ModuleList() for i, in_channels in enumerate(combined_feature_dims): - if i == len(combined_feature_dims)-1 and in_channels == config.fusion_hidden_size: + if i == len(combined_feature_dims) - 1 and in_channels == config.fusion_hidden_size: # projection for last layer can be ignored if input and output channels already match self.projections.append(nn.Identity()) else: @@ -1397,7 +1415,6 @@ def __init__(self, config, use_fov_model=None): # Initialize weights and apply final processing self.post_init() - @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=DepthProDepthEstimatorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1454,7 +1471,9 @@ def forward( ) fov = fov_encodings.last_hidden_state attentions = depth_pro_outputs.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + hidden_states = ( + depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None + ) else: fov = None attentions = depth_pro_outputs.attentions From 3c656f24a5e33fed84663f2c0d45053b2b3c4e91 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 28 Nov 2024 01:29:54 +0500 Subject: [PATCH 035/151] add examples; update weight convert script --- .../convert_depth_pro_weights_to_hf.py | 4 +- .../models/depth_pro/modeling_depth_pro.py | 58 ++++++++++++++++++- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 377595b746ac..cd06a99c5fb2 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -168,7 +168,8 @@ def write_model( scaled_images_ratios=[0.25, 0.5, 1], scaled_images_overlap_ratios=[0.0, 0.5, 0.25], scaled_images_feature_dims=[1024, 1024, 512], - use_batch_norm_in_fusion=False, + use_batch_norm_in_fusion_residual=False, + use_bias_in_fusion_residual=True, use_fov_model=True, num_fov_head_layers=2, ) @@ -228,7 +229,6 @@ def write_image_processor(output_dir: str): do_normalize=True, image_mean=0.5, image_std=0.5, - return_tensors="pt", ) image_processor.save_pretrained(output_dir) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 77983933a19a..255174de0993 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1068,8 +1068,34 @@ def forward( Returns: Examples: - TODO + ```python + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, DepthProModel + >>> + >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> + >>> checkpoint = "geetu040/DepthPro" + >>> processor = AutoProcessor.from_pretrained(checkpoint) + >>> model = DepthProModel.from_pretrained(checkpoint) + >>> + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt") + >>> + >>> with torch.no_grad(): + ... 
output = model(**inputs) + ... + >>> for state in output.last_hidden_state: + ... print(state.shape) + ... + torch.Size([1, 1024, 48, 48]) + torch.Size([1, 1024, 96, 96]) + torch.Size([1, 512, 192, 192]) + torch.Size([1, 256, 384, 384]) + torch.Size([1, 256, 768, 768]) ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1433,8 +1459,36 @@ def forward( Returns: Examples: - TODO + ```python + >>> from transformers import AutoImageProcessor, DepthProForDepthEstimation + >>> import torch + >>> from PIL import Image + >>> import requests + >>> + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> + >>> checkpoint = "geetu040/DepthPro" + >>> processor = AutoImageProcessor.from_pretrained(checkpoint) + >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint) + >>> + >>> # prepare image for the model + >>> inputs = processor(images=image, return_tensors="pt") + >>> + >>> with torch.no_grad(): + ... outputs = model(**inputs) + ... + >>> # interpolate to original size + >>> post_processed_output = processor.post_process_depth_estimation( + ... outputs.predicted_depth, outputs.fov, target_sizes=[(image.height, image.width)], + ... ) + >>> + >>> # visualize the prediction + >>> predicted_depth = post_processed_output["predicted_depth"][0] + >>> depth = predicted_depth * 255 / predicted_depth.max() + >>> depth = depth.detach().cpu().numpy() + >>> depth = Image.fromarray(depth.astype("uint8")) ```""" loss = None if labels is not None: From f6f6d3d130b97519b8f9bf0ae9413301f655ecd9 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 29 Nov 2024 10:08:56 +0500 Subject: [PATCH 036/151] fix using check_table.py and isort --- docs/source/en/index.md | 1 + src/transformers/__init__.py | 18 ++++++++-------- .../models/auto/configuration_auto.py | 4 ++-- .../models/auto/image_processing_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 6 +++--- .../models/gemma/configuration_gemma.py | 1 - src/transformers/utils/dummy_pt_objects.py | 21 +++++++++++++++++++ .../utils/dummy_vision_objects.py | 14 +++++++++++++ 8 files changed, 51 insertions(+), 16 deletions(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index aaff45ab65df..d316e89ce6f4 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -117,6 +117,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [DeiT](model_doc/deit) | ✅ | ✅ | ❌ | | [DePlot](model_doc/deplot) | ✅ | ❌ | ❌ | | [Depth Anything](model_doc/depth_anything) | ✅ | ❌ | ❌ | +| [DepthPro](model_doc/depth_pro) | ✅ | ❌ | ❌ | | [DETA](model_doc/deta) | ✅ | ❌ | ❌ | | [DETR](model_doc/detr) | ✅ | ❌ | ❌ | | [DialoGPT](model_doc/dialogpt) | ✅ | ✅ | ✅ | diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0e6c48762a85..d4ac4b5fd866 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -387,6 +387,7 @@ "models.deprecated.vit_hybrid": ["ViTHybridConfig"], "models.deprecated.xlm_prophetnet": ["XLMProphetNetConfig"], "models.depth_anything": ["DepthAnythingConfig"], + "models.depth_pro": ["DepthProConfig"], "models.detr": ["DetrConfig"], "models.dialogpt": [], "models.dinat": ["DinatConfig"], @@ -408,7 +409,6 @@ "DPRReaderTokenizer", ], "models.dpt": ["DPTConfig"], - "models.depth_pro": ["DepthProConfig"], "models.efficientnet": ["EfficientNetConfig"], "models.electra": [ "ElectraConfig", @@ -1193,10 +1193,10 @@ _import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor") _import_structure["models.deprecated.tvlt"].append("TvltImageProcessor") _import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"]) + _import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"]) _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor", "DetrImageProcessorFast"]) _import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"]) _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) - _import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"]) _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"]) _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"]) @@ -2078,6 +2078,13 @@ "DepthAnythingPreTrainedModel", ] ) + _import_structure["models.depth_pro"].extend( + [ + "DepthProForDepthEstimation", + "DepthProModel", + "DepthProPreTrainedModel", + ] + ) _import_structure["models.detr"].extend( [ "DetrForObjectDetection", @@ -2138,13 +2145,6 @@ "DPTPreTrainedModel", ] ) - _import_structure["models.depth_pro"].extend( - [ - "DepthProForDepthEstimation", - "DepthProModel", - "DepthProPreTrainedModel", - ] - ) _import_structure["models.efficientnet"].extend( [ "EfficientNetForImageClassification", diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index d8860d38f850..a02af514b65a 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -85,6 +85,7 @@ ("deformable_detr", "DeformableDetrConfig"), ("deit", "DeiTConfig"), ("depth_anything", "DepthAnythingConfig"), + ("depth_pro", "DepthProConfig"), ("deta", "DetaConfig"), ("detr", "DetrConfig"), ("dinat", "DinatConfig"), @@ -93,7 +94,6 @@ ("donut-swin", "DonutSwinConfig"), ("dpr", "DPRConfig"), ("dpt", "DPTConfig"), - ("depth_pro", "DepthProConfig"), ("efficientformer", "EfficientFormerConfig"), ("efficientnet", "EfficientNetConfig"), ("electra", "ElectraConfig"), @@ -385,6 +385,7 @@ ("deplot", "DePlot"), ("depth_anything", "Depth Anything"), ("depth_anything_v2", "Depth Anything V2"), + ("depth_pro", "DepthPro"), ("deta", "DETA"), ("detr", "DETR"), 
("dialogpt", "DialoGPT"), @@ -395,7 +396,6 @@ ("donut-swin", "DonutSwin"), ("dpr", "DPR"), ("dpt", "DPT"), - ("depth_pro", "DepthPro"), ("efficientformer", "EfficientFormer"), ("efficientnet", "EfficientNet"), ("electra", "ELECTRA"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index e7b53f30a7a0..3887f29415b0 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -71,13 +71,13 @@ ("deformable_detr", ("DeformableDetrImageProcessor",)), ("deit", ("DeiTImageProcessor",)), ("depth_anything", ("DPTImageProcessor",)), + ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")), ("deta", ("DetaImageProcessor",)), ("detr", ("DetrImageProcessor", "DetrImageProcessorFast")), ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")), ("dinov2", ("BitImageProcessor",)), ("donut-swin", ("DonutImageProcessor",)), ("dpt", ("DPTImageProcessor",)), - ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")), ("efficientformer", ("EfficientFormerImageProcessor",)), ("efficientnet", ("EfficientNetImageProcessor",)), ("flava", ("FlavaImageProcessor",)), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 4cc15ca4ca51..b8bcd0cbcb00 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -84,6 +84,7 @@ ("decision_transformer", "DecisionTransformerModel"), ("deformable_detr", "DeformableDetrModel"), ("deit", "DeiTModel"), + ("depth_pro", "DepthProModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), ("dinat", "DinatModel"), @@ -92,7 +93,6 @@ ("donut-swin", "DonutSwinModel"), ("dpr", "DPRQuestionEncoder"), ("dpt", "DPTModel"), - ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("electra", "ElectraModel"), @@ -567,12 +567,12 @@ ("data2vec-vision", "Data2VecVisionModel"), ("deformable_detr", "DeformableDetrModel"), ("deit", "DeiTModel"), + ("depth_pro", "DepthProModel"), ("deta", "DetaModel"), ("detr", "DetrModel"), ("dinat", "DinatModel"), ("dinov2", "Dinov2Model"), ("dpt", "DPTModel"), - ("depth_pro", "DepthProModel"), ("efficientformer", "EfficientFormerModel"), ("efficientnet", "EfficientNetModel"), ("focalnet", "FocalNetModel"), @@ -867,8 +867,8 @@ [ # Model for depth estimation mapping ("depth_anything", "DepthAnythingForDepthEstimation"), - ("dpt", "DPTForDepthEstimation"), ("depth_pro", "DepthProForDepthEstimation"), + ("dpt", "DPTForDepthEstimation"), ("glpn", "GLPNForDepthEstimation"), ("zoedepth", "ZoeDepthForDepthEstimation"), ] diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index e170803cccab..346f386ba698 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -20,7 +20,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 36e1ff2cfe65..dc32f6d653d6 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3457,6 +3457,27 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class DepthProForDepthEstimation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DepthProModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class DepthProPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class DetrForObjectDetection(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 19cf02a4e858..1ceb9e227bb2 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -177,6 +177,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class DepthProImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + +class DepthProImageProcessorFast(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class DetrFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] From b4575d026de8a8ca69650c76ab3b21f22e860a48 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 29 Nov 2024 10:45:19 +0500 Subject: [PATCH 037/151] fix config docstring --- .../models/depth_pro/configuration_depth_pro.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index d938f0a721f1..d48d68b832b4 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -38,7 +38,7 @@ class DepthProConfig(PretrainedConfig): The number of channels before fusion. num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 12): + num_attention_heads (`int`, *optional*, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. mlp_ratio (`int`, *optional*, defaults to 4): Ratio of the hidden size of the MLPs relative to the `hidden_size`. @@ -58,7 +58,7 @@ class DepthProConfig(PretrainedConfig): To generate depth of same size as image, image_size / 2**n_fusion_blocks == patch_size / patch_embeddings_size where n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) - patch_size (`int`, *optional*, defaults to 14): + patch_size (`int`, *optional*, defaults to 384): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. @@ -90,9 +90,11 @@ class DepthProConfig(PretrainedConfig): Hidden state dimensions during upsampling for each scaled image in `scaled_images_ratios`. 
use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`): + Whether to use bias in the pre-activate residual units of the fusion blocks. use_fov_model (`bool`, *optional*, defaults to `True`): Whether to use `DepthProFOVModel` to generate the field of view. - num_fov_head_layers (`int`, *optional*, defaults to `2`): + num_fov_head_layers (`int`, *optional*, defaults to 2): Number of convolution layers in the head of `DepthProFOVModel`. Example: From c8d8a9e0ca3750cc062fe9ad3b90fdbe5a893f0b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 29 Nov 2024 11:26:12 +0500 Subject: [PATCH 038/151] add depth pro to sdpa docs --- docs/source/en/perf_infer_gpu_one.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 67bd31fdaeed..4f1ccc9c427c 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -227,6 +227,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel) +* [DepthPro](https://huggingface.co/docs/transformers/model_doc/depth_pro#transformers.DepthProModel) * [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) * [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader) From 77873de8a34447d64d16e1a5def4ba8fb7109bb5 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Fri, 29 Nov 2024 15:30:42 +0500 Subject: [PATCH 039/151] undo unintentional changes in configuration_gemma.py --- src/transformers/models/gemma/configuration_gemma.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index 346f386ba698..e170803cccab 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -20,6 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ from ...configuration_utils import PretrainedConfig From 5f2378d112193317902a733d13b21fc081fc8b56 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 30 Nov 2024 23:51:55 +0500 Subject: [PATCH 040/151] minor fixes --- src/transformers/models/__init__.py | 1 + .../depth_pro/image_processing_depth_pro.py | 24 +++++++++++-------- .../models/depth_pro/modeling_depth_pro.py | 7 +----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 9155f629e63f..fc26362dd64d 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -69,6 +69,7 @@ deit, deprecated, depth_anything, + depth_pro, detr, dialogpt, dinat, diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 15a33f804d14..746f246fcd73 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -113,7 +113,7 @@ def __init__( def resize( self, - images: List[np.ndarray], + image: np.ndarray, size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, antialias: bool = False, @@ -125,8 +125,8 @@ def resize( Resize an image to `(size["height"], size["width"])`. Args: - images (`List[np.ndarray]`): - Images to resize. + image (`np.ndarray`): + Image to resize. size (`Dict[str, int]`): Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): @@ -157,16 +157,13 @@ def resize( raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") output_size = (size["height"], size["width"]) - images = np.stack(images) - images = torch.from_numpy(images) - return torch.nn.functional.interpolate( # input should be (B, C, H, W) - input=images, + input=torch.from_numpy(image).unsqueeze(0), size=output_size, mode=pil_torch_interpolation_mapping[resample].value, antialias=antialias, - ) + ).squeeze(0).numpy() def _validate_input_arguments( self, @@ -321,8 +318,15 @@ def preprocess( # depth-pro scales the image before resizing it # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: - images = self.resize(images, size=size_dict, resample=resample, antialias=antialias) - images = images.numpy() + images = [ + self.resize( + image=image, + size=size, + resample=resample, + antialias=antialias, + ) + for image in images + ] data = {"pixel_values": images} return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 255174de0993..16601f9c7c86 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -660,7 +660,7 @@ def merge(patches, batch_size, merge_out_size): # patches are not created when scaled image size is equal to patch size return patches - box_size = int(math.sqrt(num_patches // batch_size)) + box_size = math.ceil(math.sqrt(num_patches // batch_size)) """ merge_out_size = (box_size - 2) * (out_size - 2 * padding) + (2) * (out_size - padding) padding = (merge_out_size - box_size * out_size) / (6 - 2 * box_size) @@ -806,11 +806,6 @@ def forward( B, C, H, W = pixel_values.shape - if not (H == W == self.config.image_size): - raise ValueError( - f"Height={H} and Width={W} doesnot match the specified image_size={self.config.image_size} in config." - ) - if not (C == self.config.num_channels): raise ValueError( f"Found {C} channels in image, expected number of channels is {self.config.num_channels} from config." From d51d0b198824370c47650ca6cc49f403e9c752cc Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 30 Nov 2024 23:57:26 +0500 Subject: [PATCH 041/151] test image processing --- .../test_image_processing_depth_pro.py | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 tests/models/depth_pro/test_image_processing_depth_pro.py diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py new file mode 100644 index 000000000000..eea9ed01378d --- /dev/null +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -0,0 +1,113 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
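# The resize hunk above moves the processor from batched to per-image interpolation. A minimal
# standalone sketch of that contract (channels-first NumPy in, channels-first NumPy out) follows;
# the helper name and the 384x384 target size are illustrative, not part of this patch.
import numpy as np
import torch

def resize_channels_first(image: np.ndarray, height: int, width: int) -> np.ndarray:
    tensor = torch.from_numpy(image).unsqueeze(0)  # (C, H, W) -> (1, C, H, W), as interpolate expects
    resized = torch.nn.functional.interpolate(
        tensor, size=(height, width), mode="bilinear", antialias=False
    )
    return resized.squeeze(0).numpy()  # back to (C, height, width)

print(resize_channels_first(np.random.rand(3, 100, 120).astype(np.float32), 384, 384).shape)  # (3, 384, 384)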
+ + +import unittest + +from transformers.file_utils import is_vision_available +from transformers.testing_utils import require_torch, require_vision + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_vision_available(): + from transformers import DepthProImageProcessor, DepthProImageProcessorFast + + +class DepthProImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + super().__init__() + size = size if size is not None else {"height": 18, "width": 18} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + } + + def expected_output_image_shape(self, images): + return self.num_channels, self.size["height"], self.size["width"] + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class DepthProImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = DepthProImageProcessor if is_vision_available() else None + fast_image_processing_class = DepthProImageProcessorFast if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = DepthProImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "resample")) + self.assertTrue(hasattr(image_processing, "antialias")) + + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"height": 18, "width": 18}) + + image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) + self.assertEqual(image_processor.size, {"height": 42, "width": 42}) From 082b05555df1b7b55335d6790582f47b0e6c4ca1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 3 Dec 2024 02:01:42 +0500 Subject: [PATCH 042/151] fixes and tests --- docs/source/en/model_doc/depth_pro.md | 119 +++++++ 
.../depth_pro/configuration_depth_pro.py | 2 +- .../depth_pro/image_processing_depth_pro.py | 1 - .../models/depth_pro/modeling_depth_pro.py | 177 +++++---- tests/models/depth_pro/__init__.py | 0 .../depth_pro/test_modeling_depth_pro.py | 335 ++++++++++++++++++ 6 files changed, 558 insertions(+), 76 deletions(-) create mode 100644 docs/source/en/model_doc/depth_pro.md create mode 100644 tests/models/depth_pro/__init__.py create mode 100644 tests/models/depth_pro/test_modeling_depth_pro.py diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md new file mode 100644 index 000000000000..6472cc506dae --- /dev/null +++ b/docs/source/en/model_doc/depth_pro.md @@ -0,0 +1,119 @@ + + +# DepthPro + +## Overview + +The DepthPro model was proposed in [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073) by Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, Vladlen Koltun. + +It leverages a multi-scale [Vision Transformer (ViT)](vit) optimized for dense predictions. It downsamples an image at several scales. At each scale, it is split into patches, which are processed by a ViT-based [Dinov2](dinov2) patch encoder, with weights shared across scales. Patches are merged into feature maps, upsampled, and fused via a [DPT](dpt) like decoder. + +The abstract from the paper is the following: + +*We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.* + + + + DepthPro architecture. Taken from the original paper. + +This model was contributed by [geetu040](https://github.com/geetu040). The original code can be found [here](https://github.com/apple/ml-depth-pro). + + + +## Usage tips + +```python +from transformers import Dinov2Config, DepthProConfig, DepthProForDepthEstimation + +# initialize with a Transformer-based backbone such as DINOv2 +# in that case, we also specify `reshape_hidden_states=False` to get feature maps of shape (batch_size, num_channels, height, width) +backbone_config = Dinov2Config.from_pretrained("facebook/dinov2-base", out_features=["stage1", "stage2", "stage3", "stage4"], reshape_hidden_states=False) + +config = DepthProConfig(backbone_config=backbone_config) +model = DepthProForDepthEstimation(config=config) +``` + +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. 
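A hedged example of requesting SDPA explicitly for DepthPro is sketched below; the checkpoint id is a placeholder rather than a published Hub name.

```python
import torch
from transformers import DepthProForDepthEstimation

# "<depth-pro-checkpoint>" is a placeholder; substitute the checkpoint you actually use
model = DepthProForDepthEstimation.from_pretrained(
    "<depth-pro-checkpoint>", attn_implementation="sdpa", torch_dtype=torch.float16
)
```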
See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +``` +from transformers import ViTForImageClassification +model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16) +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `google/vit-base-patch16-224` model, we saw the following speedups during inference. + +| Batch size | Average inference time (ms), eager mode | Average inference time (ms), sdpa model | Speed up, Sdpa / Eager (x) | +|--------------|-------------------------------------------|-------------------------------------------|------------------------------| +| 1 | 7 | 6 | 1.17 | +| 2 | 8 | 6 | 1.33 | +| 4 | 8 | 6 | 1.33 | +| 8 | 8 | 6 | 1.33 | + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DepthPro. + +- Demo notebooks for [`DepthProForDepthEstimation`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DepthPro). + +- [Semantic segmentation task guide](../tasks/semantic_segmentation) +- [Monocular depth estimation task guide](../tasks/monocular_depth_estimation) + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +## DepthProConfig + +[[autodoc]] DepthProConfig + +## DepthProFeatureExtractor + +[[autodoc]] DepthProFeatureExtractor + - __call__ + - post_process_semantic_segmentation + +## DepthProImageProcessor + +[[autodoc]] DepthProImageProcessor + - preprocess + - post_process_semantic_segmentation + +## DepthProModel + +[[autodoc]] DepthProModel + - forward + +## DepthProForDepthEstimation + +[[autodoc]] DepthProForDepthEstimation + - forward + +## DepthProForSemanticSegmentation + +[[autodoc]] DepthProForSemanticSegmentation + - forward diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index d48d68b832b4..beb3215d8ddf 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -56,7 +56,7 @@ class DepthProConfig(PretrainedConfig): image_size (`int`, *optional*, defaults to 1536): The size (resolution) of each image, To generate depth of same size as image, - image_size / 2**n_fusion_blocks == patch_size / patch_embeddings_size + image_size / 2**(n_fusion_blocks+1) == patch_size / patch_embeddings_size where n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) patch_size (`int`, *optional*, defaults to 384): The size (resolution) of each patch. 
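# A quick numeric check of the corrected relation above, using the defaults visible in this
# configuration (image_size=1536, patch_size=384, intermediate_hook_ids=[11, 5],
# scaled_images_ratios=[0.25, 0.5, 1]); patch_embeddings_size=16 is an assumed default,
# as it is not shown in this hunk.
image_size = 1536
patch_size = 384
patch_embeddings_size = 16  # assumption, not taken from this hunk
n_fusion_blocks = len([11, 5]) + len([0.25, 0.5, 1])  # intermediate hooks + scaled image ratios = 5
assert image_size / 2 ** (n_fusion_blocks + 1) == patch_size / patch_embeddings_size  # 24.0 == 24.0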
diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 746f246fcd73..65a29900c637 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -264,7 +264,6 @@ def preprocess( image_std = image_std if image_std is not None else self.image_std size = size if size is not None else self.size - size_dict = get_size_dict(size) images = make_list_of_images(images) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 16601f9c7c86..2e074588d4e3 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -31,6 +31,7 @@ logging, replace_return_docstrings, torch_int, + ModelOutput, ) from .configuration_depth_pro import DepthProConfig @@ -87,9 +88,9 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.seq_len = (config.patch_size // config.patch_embeddings_size) ** 2 - self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) self.patch_embeddings = DepthProViTPatchEmbeddings(config) - self.position_embeddings = nn.Parameter(torch.randn(1, self.seq_len + 1, config.hidden_size)) + self.position_embeddings = nn.Parameter(torch.zeros(1, self.seq_len + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: @@ -698,6 +699,35 @@ def merge(patches, batch_size, merge_out_size): return boxes +@dataclass +class DepthProOutput(ModelOutput): + """ + Base class for DepthPro's outputs. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + features (`List[torch.FloatTensor]`, *optional*: + Features from scaled images and hidden_states. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + last_hidden_state: torch.FloatTensor = None + features: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + class DepthProEncoder(nn.Module): def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -794,7 +824,7 @@ def forward( output_attentions: bool = False, output_hidden_states: bool = False, return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: + ) -> Union[tuple, DepthProOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -848,8 +878,8 @@ def forward( image_encodings = self.image_encoder( pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, return_dict=True, ) @@ -941,21 +971,36 @@ def forward( scaled_images_features[0] = self.fuse_image_with_low_res(scaled_images_features[0]) # STEP 8: return these features in order of increasing size as what fusion expects - last_hidden_state = [ + features = [ # (B, self.scaled_images_feature_dims[i], self.out_size*2**(i+1), self.out_size*2**(i+1)) *scaled_images_features, # (B, config.intermediate_feature_dims[i], self.out_size*2**(self.n_scaled_images+i+1), self.out_size*2**(self.n_scaled_images+i+1)) *intermediate_features, ] - hidden_states = patch_encodings.hidden_states + image_encodings.hidden_states if output_hidden_states else None - attentions = patch_encodings.attentions + image_encodings.attentions if output_attentions else None + # prepare last_hidden_state, hidden_states, attentions from patches to batches + + last_hidden_state = patch_encodings.last_hidden_state + hidden_states = patch_encodings.hidden_states if output_hidden_states else None + attentions = patch_encodings.attentions if output_attentions else None + + num_patches = sum(scaled_images_num_patches) + # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 + indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T + indexes = indexes.to(last_hidden_state.device) + + last_hidden_state = last_hidden_state[indexes].mean(1) + if hidden_states is not None: + hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) + if attentions is not None: + attentions = tuple([state[indexes].mean(1) for state in attentions]) if not return_dict: - return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None) + return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) - return BaseModelOutput( + return DepthProOutput( last_hidden_state=last_hidden_state, + features=features, hidden_states=hidden_states, attentions=attentions, ) @@ -1034,11 +1079,7 @@ def __init__(self, config): self.post_init() def get_input_embeddings(self): - embeddings = { - "patch_embeddings": self.encoder.patch_encoder.embeddings.patch_embeddings, - "image_embeddings": self.encoder.image_encoder.embeddings.patch_embeddings, - } - return embeddings + return self.encoder.patch_encoder.embeddings.patch_embeddings def _prune_heads(self, heads_to_prune): """ @@ -1058,7 +1099,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> 
Union[Tuple, BaseModelOutput]: + ) -> Union[Tuple, DepthProOutput]: r""" Returns: @@ -1215,7 +1256,7 @@ def forward(self, hidden_state, residual=None): # Take from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro -# with extra layer parameters, deconv and reversed layers +# with num_layers, deconv and reversed layers class DepthProFeatureFusionStage(nn.Module): def __init__(self, config, num_layers): super().__init__() @@ -1269,8 +1310,8 @@ def __init__(self, config: DepthProConfig) -> None: for i in range(config.num_fov_head_layers): self.head.append( nn.Conv2d( - self.fusion_hidden_size // 2 ** (i + 1), - self.fusion_hidden_size // 2 ** (i + 2), + math.ceil(self.fusion_hidden_size / 2 ** (i + 1)), + math.ceil(self.fusion_hidden_size / 2 ** (i + 2)), kernel_size=3, stride=2, padding=1, @@ -1278,7 +1319,7 @@ def __init__(self, config: DepthProConfig) -> None: ) self.head.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer - final_in_channels = self.fusion_hidden_size // 2 ** (config.num_fov_head_layers + 1) + final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1)) final_kernal_size = int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.head.append( nn.Conv2d( @@ -1291,16 +1332,7 @@ def forward( pixel_values: torch.Tensor, global_features: torch.Tensor, head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[tuple, BaseModelOutput]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - + ) -> torch.Tensor: B, C, W, H = pixel_values.shape # follow the steps same as with image features in DepthProEncoder @@ -1316,11 +1348,11 @@ def forward( encoder_outputs = self.encoder( patches, head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, + return_dict=True, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = self.encoder_neck(last_hidden_state) last_hidden_state = reshape_feature(last_hidden_state, width=self.out_size, height=self.out_size) last_hidden_state = merge( @@ -1335,15 +1367,7 @@ def forward( fov_output = self.head(last_hidden_state) fov_output = fov_output.reshape(B) - if not return_dict: - head_outputs = (fov_output,) - return head_outputs + encoder_outputs[1:] - - return BaseModelOutput( - last_hidden_state=fov_output, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - ) + return fov_output class DepthProDepthEstimationHead(nn.Module): @@ -1377,16 +1401,36 @@ def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: @dataclass -class DepthProDepthEstimatorOutput(DepthEstimatorOutput): +class DepthProDepthEstimatorOutput(ModelOutput): """ - Base class for outputs of DepthProDepthEstimator. + Base class for DepthProForDepthEstimation's output. 
Args: - fov (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `use_fov_model` is provided): + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`): + Predicted depth for each pixel. + fov (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. + + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. """ + loss: Optional[torch.FloatTensor] = None + predicted_depth: torch.FloatTensor = None fov: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @add_start_docstrings( @@ -1502,41 +1546,26 @@ def forward( output_hidden_states=output_hidden_states, return_dict=True, ) - last_hidden_state = depth_pro_outputs.last_hidden_state - last_hidden_state = [proj(state) for proj, state in zip(self.projections, last_hidden_state)] - fused_state = self.fusion_stage(last_hidden_state) - predicted_depth = self.head(fused_state) + features = depth_pro_outputs.features + features = [proj(feature) for proj, feature in zip(self.projections, features)] + fused_features = self.fusion_stage(features) + predicted_depth = self.head(fused_features) - if self.use_fov_model: + fov = self.fov_model( + pixel_values=pixel_values, # use lowest scaled image features for fov model - global_features = last_hidden_state[0].detach() - fov_encodings = self.fov_model( - pixel_values=pixel_values, - global_features=global_features, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=True, - ) - fov = fov_encodings.last_hidden_state - attentions = depth_pro_outputs.attentions + fov_encodings.attentions if output_attentions else None - hidden_states = ( - depth_pro_outputs.hidden_states + fov_encodings.hidden_states if output_hidden_states else None - ) - else: - fov = None - attentions = depth_pro_outputs.attentions - hidden_states = depth_pro_outputs.hidden_states + global_features=features[0].detach(), + head_mask=head_mask, + ) if self.use_fov_model else None if not return_dict: - outputs = (predicted_depth, fov, hidden_states, attentions) - outputs = (i for i in outputs if i is not None) - return outputs + outputs = [loss, predicted_depth, fov, depth_pro_outputs.hidden_states, depth_pro_outputs.attentions] + return tuple(v for v in outputs if v is not None) return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, fov=fov, - hidden_states=hidden_states, - attentions=attentions, + 
hidden_states=depth_pro_outputs.hidden_states, + attentions=depth_pro_outputs.attentions, ) diff --git a/tests/models/depth_pro/__init__.py b/tests/models/depth_pro/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py new file mode 100644 index 000000000000..3d37965dcd1b --- /dev/null +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -0,0 +1,335 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch DepthPro model.""" + +import unittest + +from transformers import DepthProConfig +from transformers.file_utils import is_torch_available, is_vision_available +from transformers.testing_utils import require_torch, require_vision, slow, torch_device + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import DepthProForDepthEstimation, DepthProModel + from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES + + +if is_vision_available(): + from PIL import Image + + from transformers import DepthProImageProcessor + + +class DepthProModelTester: + def __init__( + self, + parent, + batch_size=8, + image_size=64, + patch_size=8, + patch_embeddings_size=4, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + fusion_hidden_size=16, + intermediate_hook_ids=[1, 0], + intermediate_feature_dims=[8, 8], + scaled_images_ratios=[0.5, 1.0], + scaled_images_overlap_ratios=[0.0, 0.2], + scaled_images_feature_dims=[12, 12], + num_hidden_layers=2, + num_attention_heads=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + use_fov_model=True, + num_labels=3, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.patch_embeddings_size = patch_embeddings_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.fusion_hidden_size = fusion_hidden_size + self.intermediate_hook_ids = intermediate_hook_ids + self.intermediate_feature_dims = intermediate_feature_dims + self.scaled_images_ratios = scaled_images_ratios + self.scaled_images_overlap_ratios = scaled_images_overlap_ratios + self.scaled_images_feature_dims = scaled_images_feature_dims + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.use_fov_model = use_fov_model + self.num_labels = 
num_labels + + self.num_patches = (patch_size // patch_embeddings_size) ** 2 + self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return DepthProConfig( + image_size=self.image_size, + patch_size=self.patch_size, + patch_embeddings_size=self.patch_embeddings_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + fusion_hidden_size=self.fusion_hidden_size, + intermediate_hook_ids=self.intermediate_hook_ids, + intermediate_feature_dims=self.intermediate_feature_dims, + scaled_images_ratios=self.scaled_images_ratios, + scaled_images_overlap_ratios=self.scaled_images_overlap_ratios, + scaled_images_feature_dims=self.scaled_images_feature_dims, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + initializer_range=self.initializer_range, + use_fov_model=self.use_fov_model, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = DepthProModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_depth_estimation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = DepthProForDepthEstimation(config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class DepthProModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as DepthPro does not use input_ids, inputs_embeds, + attention_mask and seq_length. 
+ """ + + all_model_classes = (DepthProModel, DepthProForDepthEstimation) if is_torch_available() else () + pipeline_model_mapping = ( + { + "depth-estimation": DepthProForDepthEstimation, + "image-feature-extraction": DepthProModel, + } + if is_torch_available() + else {} + ) + + test_pruning = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = DepthProModelTester(self) + self.config_tester = ConfigTester(self, config_class=DepthProConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + @unittest.skip(reason="DepthPro does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + def test_model_get_set_embeddings(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_depth_estimation(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + + def test_training(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DepthProForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values(): + continue + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_training_gradient_checkpointing(self): + for model_class in self.all_model_classes: + if model_class.__name__ == "DepthProForDepthEstimation": + continue + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + + if model_class.__name__ in MODEL_MAPPING_NAMES.values() or not model_class.supports_gradient_checkpointing: + continue + model = model_class(config) + model.to(torch_device) + model.gradient_checkpointing_enable() + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + # Skip the check for the backbone + backbone_params = [] + for name, module in model.named_modules(): + if module.__class__.__name__ == "DepthProViTHybridEmbeddings": + 
backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()] + break + + for name, param in model.named_parameters(): + if param.requires_grad: + if name in backbone_params: + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + @slow + def test_model_from_pretrained(self): + model_name = "Intel/depth_pro-large" + model = DepthProModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +@slow +class DepthProModelIntegrationTest(unittest.TestCase): + def test_inference_depth_estimation(self): + image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") + model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large").to(torch_device) + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth + + # verify the predicted depth + expected_shape = torch.Size((1, 384, 384)) + self.assertEqual(predicted_depth.shape, expected_shape) + + expected_slice = torch.tensor( + [[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]] + ).to(torch_device) + + self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4)) + + def test_post_processing_depth_estimation(self): + image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") + model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large") + + image = prepare_img() + inputs = image_processor(images=image, return_tensors="pt") + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + predicted_depth = image_processor.post_process_depth_estimation(outputs=outputs)[0]["predicted_depth"] + expected_shape = torch.Size((384, 384)) + self.assertTrue(predicted_depth.shape == expected_shape) + + predicted_depth_l = image_processor.post_process_depth_estimation(outputs=outputs, target_sizes=[(500, 500)]) + predicted_depth_l = predicted_depth_l[0]["predicted_depth"] + expected_shape = torch.Size((500, 500)) + self.assertTrue(predicted_depth_l.shape == expected_shape) + + output_enlarged = torch.nn.functional.interpolate( + predicted_depth.unsqueeze(0).unsqueeze(1), size=(500, 500), mode="bicubic", align_corners=False + ).squeeze() + self.assertTrue(output_enlarged.shape == expected_shape) + self.assertTrue(torch.allclose(predicted_depth_l, output_enlarged, rtol=1e-3)) From 16a39178307e3d2b484fb0df44e3ff05e0b67aff Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 3 Dec 2024 02:20:22 +0500 Subject: [PATCH 043/151] more fixes --- docs/source/en/model_doc/depth_pro.md | 19 +++++++------------ .../depth_pro/configuration_depth_pro.py | 10 ---------- .../models/depth_pro/modeling_depth_pro.py | 4 ++-- 3 files changed, 9 insertions(+), 24 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 6472cc506dae..7e4ac13f1d64 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -91,17 +91,17 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] DepthProConfig -## DepthProFeatureExtractor - 
-[[autodoc]] DepthProFeatureExtractor - - __call__ - - post_process_semantic_segmentation - ## DepthProImageProcessor [[autodoc]] DepthProImageProcessor - preprocess - - post_process_semantic_segmentation + - post_process_depth_estimation + +## DepthProImageProcessorFast + +[[autodoc]] DepthProImageProcessorFast + - preprocess + - post_process_depth_estimation ## DepthProModel @@ -112,8 +112,3 @@ If you're interested in submitting a resource to be included here, please feel f [[autodoc]] DepthProForDepthEstimation - forward - -## DepthProForSemanticSegmentation - -[[autodoc]] DepthProForSemanticSegmentation - - forward diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index beb3215d8ddf..46220a0731e6 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -72,12 +72,6 @@ class DepthProConfig(PretrainedConfig): Stochastic depth rate per sample (when applied in the main path of residual layers). use_swiglu_ffn (`bool`, *optional*, defaults to `False`): Whether to use the SwiGLU feedforward neural network. - apply_layernorm (`bool`, *optional*, defaults to `True`): - Whether to apply layer normalization to the feature maps in case the model is used as backbone. - reshape_hidden_states (`bool`, *optional*, defaults to `True`): - Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in - case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, - seq_len, hidden_size)`. intermediate_hook_ids (`List[int]`, *optional*, defaults to `[11, 5]`): Indices of the intermediate hidden states from the patch encoder to use for fusion. 
intermediate_feature_dims (`List[int]`, *optional*, defaults to `[256, 256]`): @@ -134,8 +128,6 @@ def __init__( layerscale_value=1.0, drop_path_rate=0.0, use_swiglu_ffn=False, - apply_layernorm=True, - reshape_hidden_states=True, intermediate_hook_ids=[11, 5], intermediate_feature_dims=[256, 256], scaled_images_ratios=[0.25, 0.5, 1], @@ -167,8 +159,6 @@ def __init__( self.layerscale_value = layerscale_value self.drop_path_rate = drop_path_rate self.use_swiglu_ffn = use_swiglu_ffn - self.apply_layernorm = apply_layernorm - self.reshape_hidden_states = reshape_hidden_states self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual self.use_bias_in_fusion_residual = use_bias_in_fusion_residual self.use_fov_model = use_fov_model diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 2e074588d4e3..27754c5dbafc 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -22,16 +22,16 @@ from torch import nn from ...activations import ACT2FN -from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput +from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( + ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings, torch_int, - ModelOutput, ) from .configuration_depth_pro import DepthProConfig From 2408ec54e4f27d2abbecdb8374e58f34d91d8e96 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 3 Dec 2024 12:18:09 +0500 Subject: [PATCH 044/151] use output states from image_encoder instead --- .../models/depth_pro/modeling_depth_pro.py | 49 ++++++++----------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 27754c5dbafc..00241bb86465 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -103,7 +103,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_positions = self.position_embeddings.shape[1] - 1 + num_positions = embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: @@ -117,8 +117,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: new_height = height // self.config.patch_embeddings_size new_width = width // self.config.patch_embeddings_size - sqrt_num_positions = torch_int(num_positions**0.5) - patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed_size = torch_int(patch_pos_embed.shape[1] ** 0.5) + patch_pos_embed = patch_pos_embed.reshape(1, patch_pos_embed_size, patch_pos_embed_size, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) target_dtype = patch_pos_embed.dtype @@ -734,6 +734,7 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.hidden_size = config.hidden_size self.fusion_hidden_size = config.fusion_hidden_size + self.patch_size = 
config.patch_size self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -867,7 +868,7 @@ def forward( patch_encodings = self.patch_encoder( patches, head_mask=head_mask, - output_attentions=output_attentions, + output_attentions=False, output_hidden_states=True, # required for intermediate features return_dict=True, ) @@ -875,11 +876,18 @@ def forward( patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] )[::-1] # -1 as patch encoder expects high res patches first + # scale the image to patch size for image_encoder + scaled_image_to_patch_size = nn.functional.interpolate( + pixel_values, + size=(self.patch_size, self.patch_size), + mode="bilinear", + align_corners=False, + ) image_encodings = self.image_encoder( - pixel_values=scaled_images[0], # provide least resolution image + pixel_values=scaled_image_to_patch_size, head_mask=head_mask, - output_attentions=False, - output_hidden_states=False, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, return_dict=True, ) @@ -946,19 +954,15 @@ def forward( # a. extract hidden_state hidden_state = ( image_encodings.last_hidden_state - ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) + ) # (B, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) + ) # (B, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - image_features = merge( - image_features, - batch_size=B, - merge_out_size=self.out_size * 2 ** (0), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + # no merge required for image_features as they are already in batches instead of patches # d. upsample image_features = self.upsample_image( @@ -980,20 +984,9 @@ def forward( # prepare last_hidden_state, hidden_states, attentions from patches to batches - last_hidden_state = patch_encodings.last_hidden_state - hidden_states = patch_encodings.hidden_states if output_hidden_states else None - attentions = patch_encodings.attentions if output_attentions else None - - num_patches = sum(scaled_images_num_patches) - # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 - indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T - indexes = indexes.to(last_hidden_state.device) - - last_hidden_state = last_hidden_state[indexes].mean(1) - if hidden_states is not None: - hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) - if attentions is not None: - attentions = tuple([state[indexes].mean(1) for state in attentions]) + last_hidden_state = image_encodings.last_hidden_state + hidden_states = image_encodings.hidden_states if output_hidden_states else None + attentions = image_encodings.attentions if output_attentions else None if not return_dict: return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) From be0c2a37478589c31d5b3864f16b955f952b43cd Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 09:13:25 +0500 Subject: [PATCH 045/151] Revert "use output states from image_encoder instead" This reverts commit 2408ec54e4f27d2abbecdb8374e58f34d91d8e96. 
--- .../models/depth_pro/modeling_depth_pro.py | 49 +++++++++++-------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 00241bb86465..27754c5dbafc 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -103,7 +103,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_positions = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: @@ -117,8 +117,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: new_height = height // self.config.patch_embeddings_size new_width = width // self.config.patch_embeddings_size - patch_pos_embed_size = torch_int(patch_pos_embed.shape[1] ** 0.5) - patch_pos_embed = patch_pos_embed.reshape(1, patch_pos_embed_size, patch_pos_embed_size, dim) + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) target_dtype = patch_pos_embed.dtype @@ -734,7 +734,6 @@ def __init__(self, config: DepthProConfig) -> None: self.config = config self.hidden_size = config.hidden_size self.fusion_hidden_size = config.fusion_hidden_size - self.patch_size = config.patch_size self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -868,7 +867,7 @@ def forward( patch_encodings = self.patch_encoder( patches, head_mask=head_mask, - output_attentions=False, + output_attentions=output_attentions, output_hidden_states=True, # required for intermediate features return_dict=True, ) @@ -876,18 +875,11 @@ def forward( patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] )[::-1] # -1 as patch encoder expects high res patches first - # scale the image to patch size for image_encoder - scaled_image_to_patch_size = nn.functional.interpolate( - pixel_values, - size=(self.patch_size, self.patch_size), - mode="bilinear", - align_corners=False, - ) image_encodings = self.image_encoder( - pixel_values=scaled_image_to_patch_size, + pixel_values=scaled_images[0], # provide least resolution image head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, return_dict=True, ) @@ -954,15 +946,19 @@ def forward( # a. extract hidden_state hidden_state = ( image_encodings.last_hidden_state - ) # (B, self.seq_len+1, config.hidden_size) + ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (B, config.hidden_size, self.out_size, self.out_size) + ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) # c. 
merge patches back together - # no merge required for image_features as they are already in batches instead of patches + image_features = merge( + image_features, + batch_size=B, + merge_out_size=self.out_size * 2 ** (0), + ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) # d. upsample image_features = self.upsample_image( @@ -984,9 +980,20 @@ def forward( # prepare last_hidden_state, hidden_states, attentions from patches to batches - last_hidden_state = image_encodings.last_hidden_state - hidden_states = image_encodings.hidden_states if output_hidden_states else None - attentions = image_encodings.attentions if output_attentions else None + last_hidden_state = patch_encodings.last_hidden_state + hidden_states = patch_encodings.hidden_states if output_hidden_states else None + attentions = patch_encodings.attentions if output_attentions else None + + num_patches = sum(scaled_images_num_patches) + # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 + indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T + indexes = indexes.to(last_hidden_state.device) + + last_hidden_state = last_hidden_state[indexes].mean(1) + if hidden_states is not None: + hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) + if attentions is not None: + attentions = tuple([state[indexes].mean(1) for state in attentions]) if not return_dict: return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) From efed39f86e629a56df892f45dcbb5d4dc05222a4 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 09:18:16 +0500 Subject: [PATCH 046/151] make embeddings dynamic --- src/transformers/models/depth_pro/modeling_depth_pro.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 27754c5dbafc..4f97f37230cb 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -80,6 +80,7 @@ class DepthProViTEmbeddings(nn.Module): """ Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Embeddings except antialias=True in interpolation and removal of mask_token + and enabling dynamic embeddings. 
""" def __init__(self, config: DepthProConfig) -> None: @@ -103,7 +104,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ - num_positions = self.position_embeddings.shape[1] - 1 + num_positions = embeddings.shape[1] - 1 # always interpolate when tracing to ensure the exported model works for dynamic input shapes if not torch.jit.is_tracing() and self.seq_len == num_positions and height == width: @@ -117,8 +118,8 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: new_height = height // self.config.patch_embeddings_size new_width = width // self.config.patch_embeddings_size - sqrt_num_positions = torch_int(num_positions**0.5) - patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed_size = torch_int(patch_pos_embed.shape[1] ** 0.5) + patch_pos_embed = patch_pos_embed.reshape(1, patch_pos_embed_size, patch_pos_embed_size, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) target_dtype = patch_pos_embed.dtype From c3b14fbcc54a1877bf6ebb7b7b61d9d67f1753ce Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 10:58:45 +0500 Subject: [PATCH 047/151] reshape output hidden states and attentions as part of computation graph --- .../models/depth_pro/modeling_depth_pro.py | 114 +++++++++++++----- .../depth_pro/test_modeling_depth_pro.py | 3 +- 2 files changed, 88 insertions(+), 29 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 4f97f37230cb..6f20838375cf 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -42,6 +42,25 @@ _CONFIG_FOR_DOC = "DepthProConfig" +def patch_to_batch(data: torch.Tensor, batch_size: int) -> torch.Tensor: + """ + converts tensor from shape: + (num_patches, seq_len, hidden_size) -> (batch_size, num_patches_per_batch, seq_len, hidden_size) + """ + data = data.reshape(-1, batch_size, *data.shape[1:]) + data = data.transpose(0, 1) + return data + +def batch_to_patch(data: torch.Tensor) -> torch.Tensor: + """ + converts tensor from shape: + (batch_size, num_patches_per_batch, seq_len, hidden_size) -> (num_patches, seq_len, hidden_size) + """ + data = data.transpose(0, 1) + data = data.reshape(-1, *data.shape[2:]) + return data + + class DepthProViTPatchEmbeddings(nn.Module): """ Copied from transformers.models.dinov2.modeling_dinov2.Dinov2PatchEmbeddings @@ -135,13 +154,17 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: return torch.cat((class_pos_embed, patch_pos_embed), dim=1) - def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: - batch_size, _, height, width = pixel_values.shape + def forward( + self, + pixel_values: torch.Tensor, + batch_size: Optional[int] = None, + ) -> torch.Tensor: + n, _, height, width = pixel_values.shape target_dtype = self.patch_embeddings.projection.weight.dtype embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) # add the [CLS] token to the embedded patch tokens - cls_tokens = self.cls_token.expand(batch_size, -1, -1) + cls_tokens = self.cls_token.expand(n, -1, -1) embeddings = torch.cat((cls_tokens, embeddings), dim=1) # add positional encoding to each token @@ -149,11 +172,14 @@ def forward(self, pixel_values: 
torch.Tensor) -> torch.Tensor: embeddings = self.dropout(embeddings) + if batch_size is not None: + embeddings = patch_to_batch(embeddings, batch_size) + return embeddings -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->DepthPro class DepthProViTSelfAttention(nn.Module): + # Copied from transformers.models.vit.modeling_vit.ViTSelfAttention.__init__ with ViT->DepthPro def __init__(self, config: DepthProConfig) -> None: super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -172,13 +198,20 @@ def __init__(self, config: DepthProConfig) -> None: self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + # Copied from transformers.models.vit.modeling_vit.ViTSelfAttention.transpose_for_scores with ViT->DepthPro def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(new_x_shape) return x.permute(0, 2, 1, 3) + # Taken from transformers.models.vit.modeling_vit.ViTSelfAttention.forward with ViT->DepthPro + # with the addition of `batch_size` def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: mixed_query_layer = self.query(hidden_states) @@ -202,25 +235,37 @@ def forward( if head_mask is not None: attention_probs = attention_probs * head_mask - context_layer = torch.matmul(attention_probs, value_layer) + if batch_size is not None: + attention_probs_batched = patch_to_batch(attention_probs, batch_size) + attention_probs_patched = batch_to_patch(attention_probs_batched) + else: + attention_probs_patched = attention_probs_batched = attention_probs + + context_layer = torch.matmul(attention_probs_patched, value_layer) context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(new_context_layer_shape) - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + outputs = (context_layer, attention_probs_batched) if output_attentions else (context_layer,) return outputs -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention with Dinov2Config->DepthProConfig, Dinov2->DepthProViT class DepthProViTSdpaSelfAttention(DepthProViTSelfAttention): + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention.__init__ with Dinov2Config->DepthProConfig, Dinov2->DepthProViT def __init__(self, config: DepthProConfig) -> None: super().__init__(config) self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + # Taken from transformers.models.dinov2.modeling_dinov2.Dinov2SdpaSelfAttention.forward with Dinov2Config->DepthProConfig, Dinov2->DepthProViT + # with the addition of `batch_size` def forward( - self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + self, + hidden_states, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: if output_attentions: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. 
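For reference, the `patch_to_batch` and `batch_to_patch` helpers introduced in this commit are pure reshapes and exact inverses of each other. A minimal sketch of the round trip with dummy sizes (the numbers below are made up for illustration and are not tied to any DepthPro checkpoint):

```python
import torch

# Hypothetical sizes: 6 patches cut from a batch of 3 images, 5 tokens each, hidden size 4.
num_patches, batch_size, seq_len, hidden_size = 6, 3, 5, 4
data = torch.randn(num_patches, seq_len, hidden_size)

# patch_to_batch: (num_patches, seq_len, hidden) -> (batch_size, num_patches_per_batch, seq_len, hidden)
batched = data.reshape(-1, batch_size, *data.shape[1:]).transpose(0, 1)
assert batched.shape == (batch_size, num_patches // batch_size, seq_len, hidden_size)

# batch_to_patch undoes the transform, recovering the original patch-major layout.
restored = batched.transpose(0, 1).reshape(-1, *batched.shape[2:])
assert torch.equal(restored, data)
```

This is why the encoder can expose batched `hidden_states` and `attentions` as part of the computation graph while the attention math itself still runs over the flat patch dimension.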
@@ -229,7 +274,7 @@ def forward( 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' ) return super().forward( - hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions + hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions, batch_size=batch_size, ) mixed_query_layer = self.query(hidden_states) @@ -274,14 +319,15 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTAttention(nn.Module): + # Copied from transformers.models.vit.modeling_vit.ViTAttention.__init__ with ViTConfig->DepthProConfig, ViT->DepthProViT def __init__(self, config: DepthProConfig) -> None: super().__init__() self.attention = DepthProViTSelfAttention(config) self.output = DepthProViTSelfOutput(config) self.pruned_heads = set() + # Copied from transformers.models.vit.modeling_vit.ViTAttention.prune_heads def prune_heads(self, heads: Set[int]) -> None: if len(heads) == 0: return @@ -300,13 +346,16 @@ def prune_heads(self, heads: Set[int]) -> None: self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) + # Taken from transformers.models.vit.modeling_vit.ViTAttention.prune_heads + # with the addition of `batch_size` def forward( self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - self_outputs = self.attention(hidden_states, head_mask, output_attentions) + self_outputs = self.attention(hidden_states, head_mask, output_attentions, batch_size) attention_output = self.output(self_outputs[0], hidden_states) @@ -411,10 +460,10 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: } -# Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer with Dinov2Config->DepthProConfig, Dinov2->DepthProViT all-casing class DepthProViTLayer(nn.Module): """This corresponds to the Block class in the original implementation.""" + # Copied from transformers.models.dinov2.modeling_dinov2.Dinov2Layer.__init__ with Dinov2Config->DepthProConfig, Dinov2->DepthProViT all-casing def __init__(self, config: DepthProConfig) -> None: super().__init__() @@ -431,16 +480,23 @@ def __init__(self, config: DepthProConfig) -> None: self.mlp = DepthProViTMLP(config) self.layer_scale2 = DepthProViTLayerScale(config) + # Taken from transformers.models.dinov2.modeling_dinov2.Dinov2Layer.forward + # with the addition of `batch_size` def forward( self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, + batch_size: Optional[int] = None, ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if batch_size is not None: + hidden_states = batch_to_patch(hidden_states) + self_attention_outputs = self.attention( self.norm1(hidden_states), # in DepthProViT, layernorm is applied before self-attention head_mask, output_attentions=output_attentions, + batch_size=batch_size, ) attention_output = self_attention_outputs[0] @@ -458,19 +514,24 @@ def forward( # second residual connection layer_output = self.drop_path(layer_output) + hidden_states + 
if batch_size is not None: + layer_output = patch_to_batch(layer_output, batch_size) + outputs = (layer_output,) + outputs return outputs -# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViTConfig->DepthProConfig, ViT->DepthProViT class DepthProViTEncoder(nn.Module): + # Copied from transformers.models.vit.modeling_vit.ViTEncoder.__init__ with ViTConfig->DepthProConfig, ViT->DepthProViT def __init__(self, config: DepthProConfig) -> None: super().__init__() self.config = config self.layer = nn.ModuleList([DepthProViTLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + # Taken from transformers.models.vit.modeling_vit.ViTEncoder.__init__ + # with the addition of `batch_size` def forward( self, hidden_states: torch.Tensor, @@ -478,6 +539,7 @@ def forward( output_attentions: bool = False, output_hidden_states: bool = False, return_dict: bool = True, + batch_size: Optional[int] = None, ) -> Union[tuple, BaseModelOutput]: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -494,9 +556,10 @@ def forward( hidden_states, layer_head_mask, output_attentions, + batch_size, ) else: - layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions, batch_size) hidden_states = layer_outputs[0] @@ -532,6 +595,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + batch_size: Optional[int] = None, ) -> Union[Tuple, BaseModelOutput]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -542,7 +606,7 @@ def forward( if pixel_values is None: raise ValueError("You have to specify pixel_values") - embedding_output = self.embeddings(pixel_values) + embedding_output = self.embeddings(pixel_values, batch_size=batch_size) encoder_outputs = self.encoder( embedding_output, @@ -550,6 +614,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + batch_size=batch_size, ) sequence_output = encoder_outputs[0] sequence_output = self.layernorm(sequence_output) @@ -871,9 +936,12 @@ def forward( output_attentions=output_attentions, output_hidden_states=True, # required for intermediate features return_dict=True, + batch_size=B, ) + last_hidden_state = patch_encodings.last_hidden_state + last_hidden_state = batch_to_patch(last_hidden_state) scaled_images_last_hidden_state = torch.split_with_sizes( - patch_encodings.last_hidden_state, scaled_images_num_patches[::-1] + last_hidden_state, scaled_images_num_patches[::-1] )[::-1] # -1 as patch encoder expects high res patches first image_encodings = self.image_encoder( @@ -917,6 +985,7 @@ def forward( self.intermediate_hook_ids[i] + 1 ) # +1 to correct index position as hidden_states contain embedding output as well hidden_state = patch_encodings.hidden_states[layer_id] + hidden_state = batch_to_patch(hidden_state) hidden_state = hidden_state[ : scaled_images_num_patches[-1] ] # num_patches to be of same length as highest resolution @@ -985,17 +1054,6 @@ def forward( hidden_states = patch_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions if output_attentions else None - num_patches = sum(scaled_images_num_patches) - # [0, 3, 6], [1, 4, 7], [2, 5, 8] when num_patches=9 and B=3 - 
indexes = torch.arange(num_patches).reshape(num_patches//B, -1).T - indexes = indexes.to(last_hidden_state.device) - - last_hidden_state = last_hidden_state[indexes].mean(1) - if hidden_states is not None: - hidden_states = tuple([state[indexes].mean(1) for state in hidden_states]) - if attentions is not None: - attentions = tuple([state[indexes].mean(1) for state in attentions]) - if not return_dict: return tuple(v for v in [last_hidden_state, features, hidden_states, attentions] if v is not None) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 3d37965dcd1b..9e881cf273b7 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -131,7 +131,8 @@ def create_and_check_model(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + num_patches = result.last_hidden_state.shape[1] # num_patches are created dynamically + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches, self.seq_length, self.hidden_size)) def create_and_check_for_depth_estimation(self, config, pixel_values, labels): config.num_labels = self.num_labels From 7cf2485adef235b906b469a38002a8dacc3d0537 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 11:14:21 +0500 Subject: [PATCH 048/151] fix ruff formating --- .../depth_pro/image_processing_depth_pro.py | 18 ++++++---- .../models/depth_pro/modeling_depth_pro.py | 36 +++++++++++-------- .../depth_pro/test_modeling_depth_pro.py | 8 +++-- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 65a29900c637..164c7e28c6e2 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -157,13 +157,17 @@ def resize( raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") output_size = (size["height"], size["width"]) - return torch.nn.functional.interpolate( - # input should be (B, C, H, W) - input=torch.from_numpy(image).unsqueeze(0), - size=output_size, - mode=pil_torch_interpolation_mapping[resample].value, - antialias=antialias, - ).squeeze(0).numpy() + return ( + torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=torch.from_numpy(image).unsqueeze(0), + size=output_size, + mode=pil_torch_interpolation_mapping[resample].value, + antialias=antialias, + ) + .squeeze(0) + .numpy() + ) def _validate_input_arguments( self, diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 6f20838375cf..8fa286c70919 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -51,6 +51,7 @@ def patch_to_batch(data: torch.Tensor, batch_size: int) -> torch.Tensor: data = data.transpose(0, 1) return data + def batch_to_patch(data: torch.Tensor) -> torch.Tensor: """ converts tensor from shape: @@ -155,10 +156,10 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward( - self, - pixel_values: torch.Tensor, - batch_size: Optional[int] = None, - ) -> torch.Tensor: + self, + pixel_values: torch.Tensor, + batch_size: Optional[int] = None, + ) -> torch.Tensor: n, _, height, width = pixel_values.shape target_dtype = self.patch_embeddings.projection.weight.dtype embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype)) @@ -274,7 +275,10 @@ def forward( 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
) return super().forward( - hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions, batch_size=batch_size, + hidden_states=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + batch_size=batch_size, ) mixed_query_layer = self.query(hidden_states) @@ -940,9 +944,9 @@ def forward( ) last_hidden_state = patch_encodings.last_hidden_state last_hidden_state = batch_to_patch(last_hidden_state) - scaled_images_last_hidden_state = torch.split_with_sizes( - last_hidden_state, scaled_images_num_patches[::-1] - )[::-1] # -1 as patch encoder expects high res patches first + scaled_images_last_hidden_state = torch.split_with_sizes(last_hidden_state, scaled_images_num_patches[::-1]) + scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] + # -1 as patch encoder expects high res patches first image_encodings = self.image_encoder( pixel_values=scaled_images[0], # provide least resolution image @@ -1610,12 +1614,16 @@ def forward( fused_features = self.fusion_stage(features) predicted_depth = self.head(fused_features) - fov = self.fov_model( - pixel_values=pixel_values, - # use lowest scaled image features for fov model - global_features=features[0].detach(), - head_mask=head_mask, - ) if self.use_fov_model else None + fov = ( + self.fov_model( + pixel_values=pixel_values, + # use lowest scaled image features for fov model + global_features=features[0].detach(), + head_mask=head_mask, + ) + if self.use_fov_model + else None + ) if not return_dict: outputs = [loss, predicted_depth, fov, depth_pro_outputs.hidden_states, depth_pro_outputs.attentions] diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 9e881cf273b7..e350b067a118 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -91,7 +91,7 @@ def __init__( self.num_labels = num_labels self.num_patches = (patch_size // patch_embeddings_size) ** 2 - self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token + self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -131,8 +131,10 @@ def create_and_check_model(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - num_patches = result.last_hidden_state.shape[1] # num_patches are created dynamically - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches, self.seq_length, self.hidden_size)) + num_patches = result.last_hidden_state.shape[1] # num_patches are created dynamically + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, num_patches, self.seq_length, self.hidden_size) + ) def create_and_check_for_depth_estimation(self, config, pixel_values, labels): config.num_labels = self.num_labels From 0aa451df3e6862291d2097d5a1e6aa5e9aa91f23 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 22:41:11 +0500 Subject: [PATCH 049/151] fix docstring failure --- .../models/depth_pro/modeling_depth_pro.py | 16 +++++++++++++++- utils/check_docstrings.py | 1 - utils/check_repo.py | 1 - 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 
8fa286c70919..1498ce4003d3 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1496,11 +1496,25 @@ class DepthProDepthEstimatorOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor, ...]] = None +DEPTH_PRO_FOR_DEPTH_ESTIMATION_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`DepthProConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + use_fov_model (`bool`, *optional*, defaults to `True`): + Whether to use `DepthProFOVModel` to generate the field of view. +""" + + @add_start_docstrings( """ DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers). """, - DEPTH_PRO_START_DOCSTRING, + DEPTH_PRO_FOR_DEPTH_ESTIMATION_START_DOCSTRING, ) class DepthProForDepthEstimation(DepthProPreTrainedModel): def __init__(self, config, use_fov_model=None): diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 34deed0df47e..0be960f4a33e 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -140,7 +140,6 @@ "DPRReaderTokenizer", "DPRReaderTokenizerFast", "DPTModel", - "DepthProModel", "Data2VecAudioConfig", "Data2VecTextConfig", "Data2VecTextModel", diff --git a/utils/check_repo.py b/utils/check_repo.py index 2e131e879153..10be5cdcd262 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -213,7 +213,6 @@ "JukeboxPrior", "SamModel", "DPTForDepthEstimation", - "DepthProForDepthEstimation", "DecisionTransformerGPT2Model", "GLPNForDepthEstimation", "ViltForImagesAndTextClassification", From 160afbf57789906a134000a5b6ee99982cf4ae6f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 4 Dec 2024 23:32:39 +0500 Subject: [PATCH 050/151] use num_fov_head_layers in tests --- tests/models/depth_pro/test_modeling_depth_pro.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index e350b067a118..03f69e8ad1fe 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -64,6 +64,7 @@ def __init__( attention_probs_dropout_prob=0.1, initializer_range=0.02, use_fov_model=True, + num_fov_head_layers=0, num_labels=3, ): self.parent = parent @@ -88,6 +89,7 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.initializer_range = initializer_range self.use_fov_model = use_fov_model + self.num_fov_head_layers = num_fov_head_layers self.num_labels = num_labels self.num_patches = (patch_size // patch_embeddings_size) ** 2 @@ -124,6 +126,7 @@ def get_config(self): attention_probs_dropout_prob=self.attention_probs_dropout_prob, initializer_range=self.initializer_range, use_fov_model=self.use_fov_model, + num_fov_head_layers=self.num_fov_head_layers, ) def create_and_check_model(self, config, pixel_values, labels): From 9d2be2603d9a75346526b2a37711c6edc40125c8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 02:30:08 +0500 Subject: [PATCH 051/151] update doc --- docs/source/en/model_doc/depth_pro.md | 
37 +++++++++++++++++---------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 7e4ac13f1d64..041c4d49dffc 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -26,7 +26,7 @@ The abstract from the paper is the following: *We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.* -drawing DepthPro architecture. Taken from the original paper. @@ -38,16 +38,26 @@ This model was contributed by [geetu040](https://github.com/geetu040). The origi ## Usage tips ```python -from transformers import Dinov2Config, DepthProConfig, DepthProForDepthEstimation +from transformers import DepthProConfig, DepthProForDepthEstimation -# initialize with a Transformer-based backbone such as DINOv2 -# in that case, we also specify `reshape_hidden_states=False` to get feature maps of shape (batch_size, num_channels, height, width) -backbone_config = Dinov2Config.from_pretrained("facebook/dinov2-base", out_features=["stage1", "stage2", "stage3", "stage4"], reshape_hidden_states=False) +config = DepthProConfig() +model = DPTForDepthEstimation(config=config) +``` + +- By default model takes an input image of size `1536`, this can be changed via config, however the model is compatible with images of different width and height. +- Input image is scaled with different ratios, as specified in `scaled_images_ratios`, then each of the scaled image is patched to `patch_size` with an overlap ratio of `scaled_images_overlap_ratios`. +- These patches go through `DinoV2 (ViT)` based encoders and are reassembled via a `DPT` based decoder. +- `DepthProForDepthEstimation` can also predict the `FOV (Field of View)` if `use_fov_model` is set to `True` in the config. +- `DepthProImageProcessor` can be used for preprocessing the inputs and postprocessing the outputs. `DepthProImageProcessor.post_process_depth_estimation` interpolates the `predicted_depth` back to match the input image size. +- To generate `predicted_depth` of the same size as input image, make sure the config is created such that +``` +image_size / 2**(n_fusion_blocks+1) == patch_size / patch_embeddings_size -config = DepthProConfig(backbone_config=backbone_config) -model = DepthProForDepthEstimation(config=config) +where +n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) ``` + ### Using Scaled Dot Product Attention (SDPA) PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function @@ -59,9 +69,9 @@ page for more information. 
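For orientation, the operator referred to above can also be exercised directly; a minimal, self-contained sketch with arbitrary tensor shapes (unrelated to DepthPro's internals):

```python
import torch
import torch.nn.functional as F

# Arbitrary shapes: batch 1, 4 heads, 16 tokens, head dimension 8.
query = key = value = torch.randn(1, 4, 16, 8)

# PyTorch dispatches to the most efficient kernel available for the given inputs and hardware.
output = F.scaled_dot_product_attention(query, key, value)
print(output.shape)  # torch.Size([1, 4, 16, 8])
```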
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. -``` -from transformers import ViTForImageClassification -model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16) +```py +from transformers import DepthProForDepthEstimation +model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro", attn_implementation="sdpa", torch_dtype=torch.float16) ... ``` @@ -78,12 +88,11 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` ## Resources -A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DepthPro. +- Research Paper: [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/pdf/2410.02073) -- Demo notebooks for [`DepthProForDepthEstimation`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DepthPro). +- Official Implementation: [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro) -- [Semantic segmentation task guide](../tasks/semantic_segmentation) -- [Monocular depth estimation task guide](../tasks/monocular_depth_estimation) + If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. From e208459cebe6b8f821aa14e0d9e7735466751daf Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 02:38:32 +0500 Subject: [PATCH 052/151] check consistency with config --- .../models/depth_pro/modeling_depth_pro.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 1498ce4003d3..605ea38ea736 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -838,6 +838,23 @@ def __init__(self, config: DepthProConfig) -> None: f"by patch_embeddings_size={config.patch_embeddings_size}." ) + # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims are consistent + if not (len(config.scaled_images_ratios) == len(config.scaled_images_overlap_ratios) == len(config.scaled_images_feature_dims)): + raise ValueError( + f"len(scaled_images_ratios)={len(config.scaled_images_ratios)} and " + f"len(scaled_images_overlap_ratios)={len(config.scaled_images_overlap_ratios)} and " + f"len(scaled_images_feature_dims)={len(config.scaled_images_feature_dims)}, " + f"should match in config." + ) + + # intermediate_hook_ids, intermediate_feature_dims are consistent + if not (len(config.intermediate_hook_ids) == len(config.intermediate_feature_dims)): + raise ValueError( + f"len(intermediate_hook_ids)={len(config.intermediate_hook_ids)} and " + f"len(intermediate_feature_dims)={len(config.intermediate_feature_dims)}, " + f"should match in config." 
+ ) + # patch encoder self.patch_encoder = DepthProViT(config) From 0415722bd6dd44f4b7d56d0cacf8cdd3f958cb41 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 02:42:31 +0500 Subject: [PATCH 053/151] ruff formatting --- src/transformers/models/depth_pro/modeling_depth_pro.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 605ea38ea736..040b9eb07962 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -839,7 +839,11 @@ def __init__(self, config: DepthProConfig) -> None: ) # scaled_images_ratios, scaled_images_overlap_ratios, scaled_images_feature_dims are consistent - if not (len(config.scaled_images_ratios) == len(config.scaled_images_overlap_ratios) == len(config.scaled_images_feature_dims)): + if not ( + len(config.scaled_images_ratios) + == len(config.scaled_images_overlap_ratios) + == len(config.scaled_images_feature_dims) + ): raise ValueError( f"len(scaled_images_ratios)={len(config.scaled_images_ratios)} and " f"len(scaled_images_overlap_ratios)={len(config.scaled_images_overlap_ratios)} and " From f4e7404191244a86a91d5e93c3be82ffa7d6b970 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 10:57:52 +0500 Subject: [PATCH 054/151] update test case --- tests/models/depth_pro/test_modeling_depth_pro.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 03f69e8ad1fe..54c5e870a258 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -52,12 +52,12 @@ def __init__( use_labels=True, hidden_size=32, fusion_hidden_size=16, - intermediate_hook_ids=[1, 0], - intermediate_feature_dims=[8, 8], + intermediate_hook_ids=[0], + intermediate_feature_dims=[8], scaled_images_ratios=[0.5, 1.0], scaled_images_overlap_ratios=[0.0, 0.2], scaled_images_feature_dims=[12, 12], - num_hidden_layers=2, + num_hidden_layers=1, num_attention_heads=4, hidden_act="gelu", hidden_dropout_prob=0.1, @@ -95,6 +95,9 @@ def __init__( self.num_patches = (patch_size // patch_embeddings_size) ** 2 self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token + n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) + self.expected_depth_size = 2**(n_fusion_blocks+1) * patch_size / patch_embeddings_size + def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -145,7 +148,7 @@ def create_and_check_for_depth_estimation(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.image_size, self.image_size)) + self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() From 2c1cc10ee8ddefce3649dac81144e5095ee00ba8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 5 Dec 2024 14:55:06 +0500 Subject: [PATCH 055/151] fix ruff formatting --- tests/models/depth_pro/test_modeling_depth_pro.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 54c5e870a258..215756d45e99 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -96,7 +96,7 @@ def __init__( self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) - self.expected_depth_size = 2**(n_fusion_blocks+1) * patch_size / patch_embeddings_size + self.expected_depth_size = 2 ** (n_fusion_blocks + 1) * patch_size / patch_embeddings_size def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -148,7 +148,9 @@ def create_and_check_for_depth_estimation(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - self.parent.assertEqual(result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size)) + self.parent.assertEqual( + result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size) + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() From 871b80db318a8e8b2b70533acd62cbcec678cc74 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 6 Dec 2024 10:42:02 +0500 Subject: [PATCH 056/151] add tests for fov --- .../depth_pro/test_modeling_depth_pro.py | 39 +++++++++++++++++-- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 215756d45e99..48983c9aca3a 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -63,8 +63,7 @@ def __init__( hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, initializer_range=0.02, - use_fov_model=True, - num_fov_head_layers=0, + use_fov_model=False, num_labels=3, ): self.parent = parent @@ -89,7 +88,6 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.initializer_range = initializer_range self.use_fov_model = use_fov_model - self.num_fov_head_layers = num_fov_head_layers self.num_labels = num_labels self.num_patches = (patch_size // patch_embeddings_size) ** 2 @@ -129,7 +127,6 @@ def get_config(self): attention_probs_dropout_prob=self.attention_probs_dropout_prob, initializer_range=self.initializer_range, use_fov_model=self.use_fov_model, - num_fov_head_layers=self.num_fov_head_layers, ) def create_and_check_model(self, config, pixel_values, labels): @@ -152,6 +149,36 @@ def create_and_check_for_depth_estimation(self, config, pixel_values, labels): result.predicted_depth.shape, (self.batch_size, self.expected_depth_size, self.expected_depth_size) ) + def create_and_check_for_fov(self, config, pixel_values, labels): + model = DepthProForDepthEstimation(config, use_fov_model=True) + model.to(torch_device) + model.eval() + + # check if the fov_model (DinoV2-based encoder) is created + self.parent.assertIsNotNone(model.fov_model) + + batched_pixel_values = pixel_values + row_pixel_values = pixel_values[:1] + + with torch.no_grad(): + model_batched_output_fov = model(batched_pixel_values).fov + model_row_output_fov = model(row_pixel_values).fov + + # check if fov is returned + self.parent.assertIsNotNone(model_batched_output_fov) + self.parent.assertIsNotNone(model_row_output_fov) + + # check output shape 
consistency for fov + self.parent.assertEqual(model_batched_output_fov.shape, (self.batch_size,)) + + # check equivalence between batched and single row outputs for fov + diff = torch.max(torch.abs(model_row_output_fov - model_batched_output_fov[:1])) + model_name = model.__class__.__name__ + self.parent.assertTrue( + diff <= 1e-03, + msg=(f"Batched and Single row outputs are not equal in {model_name} for fov. " f"Difference={diff}."), + ) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, pixel_values, labels = config_and_inputs @@ -208,6 +235,10 @@ def test_for_depth_estimation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) + def test_for_fov(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_fov(*config_and_inputs) + def test_training(self): for model_class in self.all_model_classes: if model_class.__name__ == "DepthProForDepthEstimation": From 0ff06556163a39f90eede4d5e889554e46b9de46 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 6 Dec 2024 15:11:06 +0500 Subject: [PATCH 057/151] use interpolation in postprocess --- .../models/depth_pro/image_processing_depth_pro.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 164c7e28c6e2..228c3d992457 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -393,10 +393,11 @@ def post_process_depth_estimation( outputs["fov"].append(fov) # interpolate - predicted_depth = self.resize( - predicted_depth.unsqueeze(0).unsqueeze(1), + predicted_depth = torch.nn.functional.interpolate( + # input should be (B, C, H, W) + input=predicted_depth.unsqueeze(0).unsqueeze(1), size=target_size, - resample=self.resample, + mode=pil_torch_interpolation_mapping[self.resample].value, antialias=self.antialias, ).squeeze() From befa6cdbca6194a4fab82c9865bfb9deeebe54c7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 6 Dec 2024 15:26:50 +0500 Subject: [PATCH 058/151] run and fix slow tests locally --- .../depth_pro/test_modeling_depth_pro.py | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 48983c9aca3a..a3026801d593 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -94,7 +94,7 @@ def __init__( self.seq_length = (patch_size // patch_embeddings_size) ** 2 + 1 # we add 1 for the [CLS] token n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) - self.expected_depth_size = 2 ** (n_fusion_blocks + 1) * patch_size / patch_embeddings_size + self.expected_depth_size = 2 ** (n_fusion_blocks + 1) * patch_size // patch_embeddings_size def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @@ -313,8 +313,8 @@ def test_initialization(self): @slow def test_model_from_pretrained(self): - model_name = "Intel/depth_pro-large" - model = DepthProModel.from_pretrained(model_name) + model_path = "geetu040/DepthPro" + model = DepthProModel.from_pretrained(model_path) self.assertIsNotNone(model) @@ 
-329,8 +329,10 @@ def prepare_img(): @slow class DepthProModelIntegrationTest(unittest.TestCase): def test_inference_depth_estimation(self): - image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") - model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large").to(torch_device) + model_path = "geetu040/DepthPro" + image_processor = DepthProImageProcessor.from_pretrained(model_path) + model = DepthProForDepthEstimation.from_pretrained(model_path).to(torch_device) + config = model.config image = prepare_img() inputs = image_processor(images=image, return_tensors="pt").to(torch_device) @@ -341,18 +343,21 @@ def test_inference_depth_estimation(self): predicted_depth = outputs.predicted_depth # verify the predicted depth - expected_shape = torch.Size((1, 384, 384)) + n_fusion_blocks = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) + expected_depth_size = 2 ** (n_fusion_blocks + 1) * config.patch_size // config.patch_embeddings_size + expected_shape = torch.Size((1, expected_depth_size, expected_depth_size)) self.assertEqual(predicted_depth.shape, expected_shape) expected_slice = torch.tensor( - [[6.3199, 6.3629, 6.4148], [6.3850, 6.3615, 6.4166], [6.3519, 6.3176, 6.3575]] + [[1.0582, 1.1225, 1.1335], [1.1154, 1.1398, 1.1486], [1.1434, 1.1500, 1.1643]] ).to(torch_device) self.assertTrue(torch.allclose(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4)) def test_post_processing_depth_estimation(self): - image_processor = DepthProImageProcessor.from_pretrained("Intel/depth_pro-large") - model = DepthProForDepthEstimation.from_pretrained("Intel/depth_pro-large") + model_path = "geetu040/DepthPro" + image_processor = DepthProImageProcessor.from_pretrained(model_path) + model = DepthProForDepthEstimation.from_pretrained(model_path) image = prepare_img() inputs = image_processor(images=image, return_tensors="pt") @@ -361,17 +366,15 @@ def test_post_processing_depth_estimation(self): with torch.no_grad(): outputs = model(**inputs) - predicted_depth = image_processor.post_process_depth_estimation(outputs=outputs)[0]["predicted_depth"] - expected_shape = torch.Size((384, 384)) - self.assertTrue(predicted_depth.shape == expected_shape) - - predicted_depth_l = image_processor.post_process_depth_estimation(outputs=outputs, target_sizes=[(500, 500)]) - predicted_depth_l = predicted_depth_l[0]["predicted_depth"] - expected_shape = torch.Size((500, 500)) - self.assertTrue(predicted_depth_l.shape == expected_shape) + predicted_depth = outputs.predicted_depth + fov = outputs.fov + target_size = [[image.height, image.width]] * len(predicted_depth) - output_enlarged = torch.nn.functional.interpolate( - predicted_depth.unsqueeze(0).unsqueeze(1), size=(500, 500), mode="bicubic", align_corners=False - ).squeeze() - self.assertTrue(output_enlarged.shape == expected_shape) - self.assertTrue(torch.allclose(predicted_depth_l, output_enlarged, rtol=1e-3)) + outputs = image_processor.post_process_depth_estimation( + predicted_depths=predicted_depth, + fovs=fov, + target_sizes=target_size, + ) + predicted_depth = outputs["predicted_depth"][0] + expected_shape = torch.Size((image.height, image.width)) + self.assertTrue(predicted_depth.shape == expected_shape) From 99ac5e81cc98b9297a81af784bf227179f1609e3 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 19:53:22 +0500 Subject: [PATCH 059/151] use scaled_images_features for image and fov encoder --- .../models/depth_pro/modeling_depth_pro.py | 80 ++++++++++--------- 1 file changed, 43 
insertions(+), 37 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 040b9eb07962..f77e24925c88 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -959,7 +959,8 @@ def forward( patches, head_mask=head_mask, output_attentions=output_attentions, - output_hidden_states=True, # required for intermediate features + # required for intermediate features + output_hidden_states=self.n_intermediate_hooks or output_hidden_states, return_dict=True, batch_size=B, ) @@ -969,12 +970,16 @@ def forward( scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] # -1 as patch encoder expects high res patches first + # scale the image to patch size for image_encoder + image_scaled_to_patch_size = nn.functional.interpolate( + pixel_values, + size=(self.config.patch_size, self.config.patch_size), + mode="bilinear", + align_corners=False, + ) image_encodings = self.image_encoder( - pixel_values=scaled_images[0], # provide least resolution image + pixel_values=image_scaled_to_patch_size, head_mask=head_mask, - output_attentions=False, - output_hidden_states=False, - return_dict=True, ) # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram @@ -1041,19 +1046,15 @@ def forward( # a. extract hidden_state hidden_state = ( image_encodings.last_hidden_state - ) # (scaled_images_num_patches[0], self.seq_len+1, config.hidden_size) + ) # (B, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( hidden_state, self.out_size, self.out_size - ) # (scaled_images_num_patches[0], config.hidden_size, self.out_size, self.out_size) + ) # (B, config.hidden_size, self.out_size, self.out_size) # c. merge patches back together - image_features = merge( - image_features, - batch_size=B, - merge_out_size=self.out_size * 2 ** (0), - ) # (B, config.hidden_size, self.out_size*2**(self.n_scaled_images-1), self.out_size*2**(self.n_scaled_images-1)) + # no merge required for image_features as they are already in batches instead of patches # d. 
upsample image_features = self.upsample_image( @@ -1073,8 +1074,6 @@ def forward( *intermediate_features, ] - # prepare last_hidden_state, hidden_states, attentions from patches to batches - last_hidden_state = patch_encodings.last_hidden_state hidden_states = patch_encodings.hidden_states if output_hidden_states else None attentions = patch_encodings.attentions if output_attentions else None @@ -1420,35 +1419,42 @@ def forward( B, C, W, H = pixel_values.shape # follow the steps same as with image features in DepthProEncoder - pixel_values = interpolate( - pixel_values, - scale_factor=self.config.scaled_images_ratios[0], # same ratio as lowest ratioed image - ) - patches = patch( + # except for the extra encoder_neck layer applied + + image_scaled_to_patch_size = nn.functional.interpolate( pixel_values, - patch_size=self.config.patch_size, - overlap_ratio=self.config.scaled_images_overlap_ratios[0], + size=(self.config.patch_size, self.config.patch_size), + mode="bilinear", + align_corners=False, ) - encoder_outputs = self.encoder( - patches, + encodings = self.encoder( + image_scaled_to_patch_size, head_mask=head_mask, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ) - last_hidden_state = encoder_outputs.last_hidden_state - last_hidden_state = self.encoder_neck(last_hidden_state) - last_hidden_state = reshape_feature(last_hidden_state, width=self.out_size, height=self.out_size) - last_hidden_state = merge( - last_hidden_state, - batch_size=B, - merge_out_size=self.out_size, ) + # a. extract hidden_state + hidden_state = ( + encodings.last_hidden_state + ) # (B, self.seq_len+1, config.hidden_size) + # extra step + hidden_state = self.encoder_neck(hidden_state) + # (B, self.fusion_hidden_size//2, self.out_size, self.out_size) + + # b. reshape back to image like + fov_features = reshape_feature( + hidden_state, self.out_size, self.out_size + ) # (B, config.hidden_size, self.out_size, self.out_size) + + # c. merge patches back together + # no merge required for fov_features as they are already in batches instead of patches + + # d. 
upsample + # no upsampling required for fov_features, the head later downsamples to create scalars + global_features = self.global_neck(global_features) - last_hidden_state = last_hidden_state + global_features - fov_output = self.head(last_hidden_state) + fov_features = fov_features + global_features + fov_output = self.head(fov_features) fov_output = fov_output.reshape(B) return fov_output @@ -1652,7 +1658,7 @@ def forward( fov = ( self.fov_model( pixel_values=pixel_values, - # use lowest scaled image features for fov model + # frozon features from encoder are used global_features=features[0].detach(), head_mask=head_mask, ) From ebb62dd2190a164d8f4cfbb218cd7c2099515ae1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 20:28:32 +0500 Subject: [PATCH 060/151] return fused_hidden_states in fusion stage --- .../models/depth_pro/modeling_depth_pro.py | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f77e24925c88..91758a3db485 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -765,7 +765,6 @@ def merge(patches, batch_size, merge_out_size): boxes.append(boxes_in_row) boxes = torch.cat(boxes, dim=-2) - boxes = boxes[..., :merge_out_size, :merge_out_size] return boxes @@ -1303,7 +1302,7 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: # Taken from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer -# except it uses deconv, skip_add and avoids interpolation (it always receives consitent inputs) +# except it uses deconv annd skip_add class DepthProFeatureFusionLayer(nn.Module): def __init__(self, config: DepthProConfig, use_deconv: bool = True) -> None: super().__init__() @@ -1328,6 +1327,10 @@ def __init__(self, config: DepthProConfig, use_deconv: bool = True) -> None: def forward(self, hidden_state, residual=None): if residual is not None: + if hidden_state.shape != residual.shape: + residual = nn.functional.interpolate( + residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False + ) hidden_state = self.skip_add.add(hidden_state, self.residual_layer1(residual)) hidden_state = self.residual_layer2(hidden_state) @@ -1357,13 +1360,17 @@ def forward(self, hidden_states): f"doesnot match len(hidden_states)={len(hidden_states)}" ) - # first layer only uses the last hidden_state - fused_hidden_state = self.layers[0](hidden_states[0]) - # looping from the second layer to last layer - for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]): - fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states = [] + fused_hidden_state = None + for hidden_state, layer in zip(hidden_states, self.layers): + if fused_hidden_state is None: + # first layer only uses the last hidden_state + fused_hidden_state = layer(hidden_state) + else: + fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) - return fused_hidden_state + return fused_hidden_states class DepthProFOVModel(nn.Module): @@ -1652,8 +1659,8 @@ def forward( ) features = depth_pro_outputs.features features = [proj(feature) for proj, feature in zip(self.projections, features)] - fused_features = self.fusion_stage(features) - predicted_depth = self.head(fused_features) + fused_hidden_states = self.fusion_stage(features) + predicted_depth = 
self.head(fused_hidden_states[-1]) fov = ( self.fov_model( From 46c88e8bd3ba4dc2331b81fad1a54a4b902445e7 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 20:44:44 +0500 Subject: [PATCH 061/151] fix example --- .../models/depth_pro/modeling_depth_pro.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 91758a3db485..8f1609b6fb15 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1206,14 +1206,8 @@ def forward( >>> with torch.no_grad(): ... output = model(**inputs) ... - >>> for state in output.last_hidden_state: - ... print(state.shape) - ... - torch.Size([1, 1024, 48, 48]) - torch.Size([1, 1024, 96, 96]) - torch.Size([1, 512, 192, 192]) - torch.Size([1, 256, 384, 384]) - torch.Size([1, 256, 768, 768]) + >>> output.last_hidden_state.shape + torch.Size([1, 35, 577, 1024]) ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( From 243135880028d09441fb41440f760a9a2c329a33 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 12 Dec 2024 20:48:36 +0500 Subject: [PATCH 062/151] fix ruff --- src/transformers/models/depth_pro/modeling_depth_pro.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 8f1609b6fb15..bd6c811a1163 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1043,9 +1043,7 @@ def forward( # STEP 6: get image features - (6) in diagram # a. extract hidden_state - hidden_state = ( - image_encodings.last_hidden_state - ) # (B, self.seq_len+1, config.hidden_size) + hidden_state = image_encodings.last_hidden_state # (B, self.seq_len+1, config.hidden_size) # b. reshape back to image like image_features = reshape_feature( @@ -1434,9 +1432,7 @@ def forward( ) # a. 
extract hidden_state - hidden_state = ( - encodings.last_hidden_state - ) # (B, self.seq_len+1, config.hidden_size) + hidden_state = encodings.last_hidden_state # (B, self.seq_len+1, config.hidden_size) # extra step hidden_state = self.encoder_neck(hidden_state) # (B, self.fusion_hidden_size//2, self.out_size, self.out_size) From d9d3a49906bab33156ab97f8ebb7b2bd87d45a49 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 21 Dec 2024 10:23:09 +0500 Subject: [PATCH 063/151] fix copyright license for all files --- docs/source/en/model_doc/depth_pro.md | 2 +- src/transformers/models/depth_pro/__init__.py | 2 +- src/transformers/models/depth_pro/configuration_depth_pro.py | 2 +- .../models/depth_pro/convert_depth_pro_weights_to_hf.py | 2 +- src/transformers/models/depth_pro/image_processing_depth_pro.py | 2 +- .../models/depth_pro/image_processing_depth_pro_fast.py | 2 +- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- tests/models/depth_pro/test_image_processing_depth_pro.py | 2 +- tests/models/depth_pro/test_modeling_depth_pro.py | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 041c4d49dffc..9019547434af 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -1,4 +1,4 @@ - -## Usage tips +Here's an improved version of your documentation with enhanced clarity, formatting, and structure for easier understanding: +--- + +## **Usage Tips** + +Initialize the Model with Default Configuration ```python -from transformers import DepthProConfig, DepthProForDepthEstimation +from transformers import DepthProConfig, DepthProModel config = DepthProConfig() -model = DepthProForDepthEstimation(config=config) +model = DepthProModel(config=config) ``` -- Input image is scaled with different ratios, as specified in `scaled_images_ratios`, and each of the scaled image is patched to `patch_size` with an overlap ratio of `scaled_images_overlap_ratios`. -- These patches go through `DinoV2 (ViT)` based encoders and are reassembled via a `DPT` based decoder. -- `DepthProForDepthEstimation` can also predict the `FOV (Field of View)` if `use_fov_model` is set to `True` in the config. -- `DepthProImageProcessor` can be used for preprocessing the inputs and postprocessing the outputs. `DepthProImageProcessor.post_process_depth_estimation` interpolates the `predicted_depth` back to match the input image size. -- To generate `predicted_depth` of the same size as input image, make sure the config is created such that -``` -image_size / 2**(n_fusion_blocks+1) == patch_size / patch_embeddings_size +Load a Pre-Trained Model for Depth Estimation +```python +from transformers import DepthProConfig, DepthProForDepthEstimation -where -n_fusion_blocks = len(intermediate_hook_ids) + len(scaled_images_ratios) +checkpoint = "geetu040/DepthPro" +model = DepthProForDepthEstimation.from_pretrained(checkpoint) +config = model.config ``` +Key Features and Configuration Details + +1. Dual-Encoder Architecture: + - The `DepthProModel` uses **two encoders**: + - **`image_encoder`** and **`patch_encoder`**, which can be configured via `image_model_config` and `patch_model_config` in the configuration. + - By default, and in the pre-trained model, both encoders use the **`Dinov2Model`** architecture. + +2. Image Scaling and Patch Processing: + - Input images are scaled with multiple ratios, as specified in the `scaled_images_ratios` configuration. 
+ - Each scaled image is split into smaller **patches** of size `patch_size` with overlapping areas determined by `scaled_images_overlap_ratios`. + - These patches are processed by the **`patch_encoder`**, while the image is also rescaled to `patch_size` and is processed by the **`image_encoder`**. + - Outputs from both encoders (`last_hidden_state`) and selected intermediate states (`hidden_states`) from **`patch_encoder`** are fused by a `DPT`-based `FeatureFusionStage` for depth estimation. + +3. Optional Field of View (FOV) Prediction: + - If `use_fov_model` is set to `True` in the configuration, the model predicts the **Field of View (FOV)** using a third encoder. + - This encoder also scales the image to `patch_size` and uses its `last_hidden_state` for FOV prediction. The encoder can be specified in the configuration using `fov_model_config`. + +4. Configuration and Validation: + - All encoders receive input images of size `patch_size`. + - The `image_size` for each encoder in the configuration should match the `patch_size`. This is validated when creating a `DepthProConfig`. + +5. Preprocessing and Postprocessing: + - Use the `DepthProImageProcessor` for preparing inputs and processing outputs: + - **Preprocessing**: Prepare images (rescale, normalize, resize) for model input. + - **Postprocessing**: Use `DepthProImageProcessor.post_process_depth_estimation` to interpolate the predicted depth to match the original input image size. + +6. Support for Variable Resolution and Aspect Ratios: + - The `DepthProModel` can process images with different resolutions and aspect ratios. However, for generating predicted depths that match the input image size, ensure the configuration satisfies: + ```py + input_image_size / 2**(n_fusion_blocks + 1) == image_model_config.image_size / image_model_config.patch_size + ``` + + - **Where**: + - `input_image_size`: The size of the input image. + - `image_model_config.image_size`: Image size for **`image_encoder`** which equals to `patch_size` in `DepthProConfig`. + - `n_fusion_blocks`: Total fusion blocks, calculated as: + ```py + len(intermediate_hook_ids) + len(scaled_images_ratios) + ``` + +### **Customizing Encoders in `DepthProModel`** + +The `DepthProModel` architecture uses **three encoders**, each responsible for a specific task: + +1. **Patch Encoder**: Processes image patches created by splitting the input image. +2. **Image Encoder**: Processes the input image resized to `patch_size`. +3. **FOV (Field of View) Encoder**: Generates the Field of View (FOV), if `use_fov_model` is enabled. + +You can configure each encoder to use any compatible model architecture. For example, to use: +- **`ViT` (Vision Transformer)** as the **patch encoder**, and +- **`BEiT`** as the **image encoder**, and +- **`DinoV2`** as the **FOV encoder**. 
+ +```python +from transformers import DepthProConfig, DepthProForDepthEstimation + +config = DepthProConfig( + patch_model_config={ + "model_type": "vit", + "num_hidden_layers": 6, + "patch_size": 16, + "hidden_size": 512, + "num_attention_heads": 16, + "image_size": 384, # matches `patch_size` + }, + image_model_config={ + "model_type": "beit", + "num_hidden_layers": 4, + "patch_size": 8, + "hidden_size": 256, + "num_attention_heads": 8, + "image_size": 384, # matches `patch_size` + }, + fov_model_config={ + "model_type": "dinov2", + "num_hidden_layers": 4, + "patch_size": 8, + "hidden_size": 256, + "num_attention_heads": 8, + "image_size": 384, # matches `patch_size` + }, + patch_size=384, + # uses layers from the patch encoder + intermediate_hook_ids=[5, 1], + use_fov_model=True, +) +model = DepthProForDepthEstimation(config) +``` ### Using Scaled Dot Product Attention (SDPA) @@ -87,8 +177,10 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` ## Resources - Research Paper: [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/pdf/2410.02073) - - Official Implementation: [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro) +- DepthPro for Super Resolution and Image Segmentation + - Read blog on Medium: [Depth Pro: Beyond Depth](https://medium.com/@raoarmaghanshakir040/depth-pro-beyond-depth-9d822fc557ba) + - Code on Github: [geetu040/depthpro-beyond-depth](https://github.com/geetu040/depthpro-beyond-depth) From 1a2dd3af42495a250b22fead79fefca4ec283634 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 30 Jan 2025 14:13:30 +0500 Subject: [PATCH 104/151] include fov in integraiton tests --- tests/models/depth_pro/test_modeling_depth_pro.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 0858c8c2a0e6..4347d507ae34 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -346,20 +346,26 @@ def test_inference_depth_estimation(self): # forward pass with torch.no_grad(): outputs = model(**inputs) - predicted_depth = outputs.predicted_depth # verify the predicted depth n_fusion_blocks = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) out_size = config.image_model_config.image_size // config.image_model_config.patch_size expected_depth_size = 2 ** (n_fusion_blocks + 1) * out_size + expected_shape = torch.Size((1, expected_depth_size, expected_depth_size)) - self.assertEqual(predicted_depth.shape, expected_shape) + self.assertEqual(outputs.predicted_depth.shape, expected_shape) expected_slice = torch.tensor( [[1.0582, 1.1225, 1.1335], [1.1154, 1.1398, 1.1486], [1.1434, 1.1500, 1.1643]] ).to(torch_device) + torch.testing.assert_close(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4, rtol=1e-4) + + # verify the predicted fov + expected_shape = torch.Size((1,)) + self.assertEqual(outputs.fov.shape, expected_shape) - torch.testing.assert_close(outputs.predicted_depth[0, :3, :3], expected_slice, atol=1e-4, rtol=0) + expected_slice = torch.tensor([47.2459]).to(torch_device) + torch.testing.assert_close(outputs.fov, expected_slice, atol=1e-4, rtol=1e-4) def test_post_processing_depth_estimation(self): model_path = "geetu040/DepthPro" From 4cfebaebfc34cf5b16933010f46ba51c42710c0d Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 30 Jan 2025 19:29:44 +0500 Subject: [PATCH 
105/151] update docs --- docs/source/en/model_doc/depth_pro.md | 186 ++++++++++++-------------- 1 file changed, 83 insertions(+), 103 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 2b74557b41b1..20b526dda76e 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -20,133 +20,112 @@ rendered properly in your Markdown viewer. The DepthPro model was proposed in [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073) by Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, Vladlen Koltun. -It leverages a multi-scale [Vision Transformer (ViT)](vit) optimized for dense predictions. It downsamples an image at several scales. At each scale, it is split into patches, which are processed by a ViT-based [Dinov2](dinov2) patch encoder, with weights shared across scales. Patches are merged into feature maps, upsampled, and fused via a [DPT](dpt)-like decoder. +DepthPro is a foundation model for zero-shot metric monocular depth estimation, designed to generate high-resolution depth maps with remarkable sharpness and fine-grained details. It employs a multi-scale Vision Transformer (ViT)-based architecture, where images are downsampled, divided into patches, and processed using a shared Dinov2 encoder. The extracted patch-level features are merged, upsampled, and refined using a DPT-like fusion stage, enabling precise depth estimation. The abstract from the paper is the following: *We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.* -drawing - DepthPro architecture. Taken from the original paper. + DepthPro Outputs. Taken from the official code. This model was contributed by [geetu040](https://github.com/geetu040). The original code can be found [here](https://github.com/apple/ml-depth-pro). - +## Usage Tips + +The DepthPro model processes an input image by first downsampling it at multiple scales and splitting each scaled version into patches. These patches are then encoded using a shared Vision Transformer (ViT)-based Dinov2 patch encoder, while the full image is processed by a separate image encoder. The extracted patch features are merged into feature maps, upsampled, and fused using a DPT-like decoder to generate the final depth estimation. If enabled, an additional Field of View (FOV) encoder processes the image for estimating the camera's field of view, aiding in depth accuracy. 
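To make the multi-scale patching described in the paragraph above concrete, here is a minimal, self-contained sketch of the idea: the image is downsampled to several ratios and each scaled copy is cut into overlapping fixed-size windows. This is an illustration only, not the implementation added by this patch; the ratios, patch size, and overlap value are placeholder numbers.

```python
import torch
import torch.nn.functional as F


def make_overlapping_patches(pixel_values, patch_size=384, overlap_ratio=0.25):
    # A stride smaller than the window size produces overlapping patches.
    stride = int(patch_size * (1 - overlap_ratio))
    windows = pixel_values.unfold(2, patch_size, stride).unfold(3, patch_size, stride)
    # (batch, channels, n_h, n_w, patch, patch) -> (batch * n_h * n_w, channels, patch, patch)
    batch, channels, n_h, n_w, _, _ = windows.shape
    return windows.permute(0, 2, 3, 1, 4, 5).reshape(-1, channels, patch_size, patch_size)


image = torch.rand(1, 3, 1536, 1536)  # dummy input image
for ratio in (0.25, 0.5, 1.0):  # example scaling ratios
    scaled = F.interpolate(image, scale_factor=ratio, mode="bilinear", align_corners=False)
    # the smallest scale is already patch-sized, so it is kept as a single patch
    patches = make_overlapping_patches(scaled) if min(scaled.shape[-2:]) > 384 else scaled
    print(f"ratio={ratio}: scaled {tuple(scaled.shape)} -> patches {tuple(patches.shape)}")
```

Each batch of patches then goes through the shared patch encoder, and the resulting patch features are merged back into feature maps before upsampling and fusion, as the paragraph above describes.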
-Here's an improved version of your documentation with enhanced clarity, formatting, and structure for easier understanding: +```py +>>> import requests +>>> from PIL import Image +>>> import torch +>>> from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation ---- +>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +>>> image = Image.open(requests.get(url, stream=True).raw) -## **Usage Tips** +>>> image_processor = DepthProImageProcessorFast.from_pretrained("geetu040/DepthPro") +>>> model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro") -Initialize the Model with Default Configuration -```python -from transformers import DepthProConfig, DepthProModel +>>> inputs = image_processor(images=image, return_tensors="pt") -config = DepthProConfig() -model = DepthProModel(config=config) -``` +>>> with torch.no_grad(): +... outputs = model(**inputs) -Load a Pre-Trained Model for Depth Estimation -```python -from transformers import DepthProConfig, DepthProForDepthEstimation +>>> post_processed_output = image_processor.post_process_depth_estimation( +... outputs, target_sizes=[(image.height, image.width)], +... ) -checkpoint = "geetu040/DepthPro" -model = DepthProForDepthEstimation.from_pretrained(checkpoint) -config = model.config +>>> fov = post_processed_output[0]["fov"] +>>> depth = post_processed_output[0]["predicted_depth"] +>>> depth = (depth - depth.min()) / depth.max() +>>> depth = depth * 255. +>>> depth = depth.detach().cpu().numpy() +>>> depth = Image.fromarray(depth.astype("uint8")) ``` -Key Features and Configuration Details +### Architecture and Configuration -1. Dual-Encoder Architecture: - - The `DepthProModel` uses **two encoders**: - - **`image_encoder`** and **`patch_encoder`**, which can be configured via `image_model_config` and `patch_model_config` in the configuration. - - By default, and in the pre-trained model, both encoders use the **`Dinov2Model`** architecture. + + + DepthPro architecture. Taken from the original paper. -2. Image Scaling and Patch Processing: - - Input images are scaled with multiple ratios, as specified in the `scaled_images_ratios` configuration. +The `DepthProForDepthEstimation` model uses a `DepthProEncoder`, for encoding the input image and a `FeatureFusionStage` for fusing the output features from encoder. + +The `DepthProEncoder` further uses two encoders: +- `patch_encoder` + - Input image is scaled with multiple ratios, as specified in the `scaled_images_ratios` configuration. - Each scaled image is split into smaller **patches** of size `patch_size` with overlapping areas determined by `scaled_images_overlap_ratios`. - - These patches are processed by the **`patch_encoder`**, while the image is also rescaled to `patch_size` and is processed by the **`image_encoder`**. - - Outputs from both encoders (`last_hidden_state`) and selected intermediate states (`hidden_states`) from **`patch_encoder`** are fused by a `DPT`-based `FeatureFusionStage` for depth estimation. - -3. Optional Field of View (FOV) Prediction: - - If `use_fov_model` is set to `True` in the configuration, the model predicts the **Field of View (FOV)** using a third encoder. - - This encoder also scales the image to `patch_size` and uses its `last_hidden_state` for FOV prediction. The encoder can be specified in the configuration using `fov_model_config`. - -4. Configuration and Validation: - - All encoders receive input images of size `patch_size`. 
- - The `image_size` for each encoder in the configuration should match the `patch_size`. This is validated when creating a `DepthProConfig`. - -5. Preprocessing and Postprocessing: - - Use the `DepthProImageProcessor` for preparing inputs and processing outputs: - - **Preprocessing**: Prepare images (rescale, normalize, resize) for model input. - - **Postprocessing**: Use `DepthProImageProcessor.post_process_depth_estimation` to interpolate the predicted depth to match the original input image size. - -6. Support for Variable Resolution and Aspect Ratios: - - The `DepthProModel` can process images with different resolutions and aspect ratios. However, for generating predicted depths that match the input image size, ensure the configuration satisfies: - ```py - input_image_size / 2**(n_fusion_blocks + 1) == image_model_config.image_size / image_model_config.patch_size - ``` - - - **Where**: - - `input_image_size`: The size of the input image. - - `image_model_config.image_size`: Image size for **`image_encoder`** which equals to `patch_size` in `DepthProConfig`. - - `n_fusion_blocks`: Total fusion blocks, calculated as: - ```py - len(intermediate_hook_ids) + len(scaled_images_ratios) - ``` - -### **Customizing Encoders in `DepthProModel`** - -The `DepthProModel` architecture uses **three encoders**, each responsible for a specific task: - -1. **Patch Encoder**: Processes image patches created by splitting the input image. -2. **Image Encoder**: Processes the input image resized to `patch_size`. -3. **FOV (Field of View) Encoder**: Generates the Field of View (FOV), if `use_fov_model` is enabled. - -You can configure each encoder to use any compatible model architecture. For example, to use: -- **`ViT` (Vision Transformer)** as the **patch encoder**, and -- **`BEiT`** as the **image encoder**, and -- **`DinoV2`** as the **FOV encoder**. - -```python -from transformers import DepthProConfig, DepthProForDepthEstimation - -config = DepthProConfig( - patch_model_config={ - "model_type": "vit", - "num_hidden_layers": 6, - "patch_size": 16, - "hidden_size": 512, - "num_attention_heads": 16, - "image_size": 384, # matches `patch_size` - }, - image_model_config={ - "model_type": "beit", - "num_hidden_layers": 4, - "patch_size": 8, - "hidden_size": 256, - "num_attention_heads": 8, - "image_size": 384, # matches `patch_size` - }, - fov_model_config={ - "model_type": "dinov2", - "num_hidden_layers": 4, - "patch_size": 8, - "hidden_size": 256, - "num_attention_heads": 8, - "image_size": 384, # matches `patch_size` - }, - patch_size=384, - # uses layers from the patch encoder - intermediate_hook_ids=[5, 1], - use_fov_model=True, -) -model = DepthProForDepthEstimation(config) + - These patches are processed by the **`patch_encoder`** +- `image_encoder` + - Input image is also rescaled to `patch_size` and processed by the **`image_encoder`** + +Both these encoders can be configured via `patch_model_config` and `image_model_config` respectively, both of which are seperate `Dinov2Model` by default. + +Outputs from both encoders (`last_hidden_state`) and selected intermediate states (`hidden_states`) from **`patch_encoder`** are fused by a `DPT`-based `FeatureFusionStage` for depth estimation. + +### Field-of-View (FOV) Prediction + +The network is supplemented with a focal length estimation head. A small convolutional head ingests frozen features from the depth estimation network and task-specific features from a separate ViT image encoder to predict the horizontal angular field-of-view. 
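Since this head predicts a horizontal angular field-of-view, a common follow-up is converting that angle into a focal length in pixels under a pinhole-camera assumption. The sketch below is illustrative and not part of this patch; it assumes the predicted FOV is expressed in degrees, as the integration-test value of about 47.2 earlier in this series suggests, and the image width is a placeholder.

```python
import math


def fov_to_focal_length_px(fov_degrees: float, image_width_px: int) -> float:
    # Pinhole model: focal_length = (width / 2) / tan(horizontal_fov / 2)
    return 0.5 * image_width_px / math.tan(0.5 * math.radians(fov_degrees))


# Example: a ~47.2 degree horizontal FOV on a 1536 px wide image gives roughly a 1756 px focal length.
print(fov_to_focal_length_px(fov_degrees=47.2459, image_width_px=1536))
```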
+ +The `use_fov_model` parameter in `DepthProConfig` controls whether **FOV prediction** is enabled. By default, it is set to `False` to conserve memory and computation. When enabled, the **FOV encoder** is instantiated based on the `fov_model_config` parameter, which defaults to a `Dinov2Model`. The `use_fov_model` parameter can also be passed when initializing the `DepthProForDepthEstimation` model. + +The pretrained model at checkpoint `geetu040/DepthPro` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. +```py +>>> from transformers import DepthProForDepthEstimation +>>> model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro", use_fov_model=False) ``` +To instantiate a new model with FOV encoder, set `use_fov_model=True` in the config. +```py +>>> from transformers import DepthProConfig, DepthProForDepthEstimation +>>> config = DepthProConfig(use_fov_model=True) +>>> model = DepthProForDepthEstimation(config) +``` + +Or set `use_fov_model=True` when initializing the model, which overrides the value in config. +```py +>>> from transformers import DepthProConfig, DepthProForDepthEstimation +>>> config = DepthProConfig() +>>> model = DepthProForDepthEstimation(config, use_fov_model=True) +``` + +### Image Resolution and Aspect Ratio + +The network can process images of different resolutions and aspect ratios and the predicted depth size can be calculated using the following formula: + +$\text{Predicted Depth Size} = \frac{2^{N+1} \cdot S}{P}$ + +Where: +- $N = \text{len}(\text{intermediate\_hook\_ids}) + \text{len}(\text{scaled\_images\_ratios})$ +- $S = \text{image\_model\_config.image\_size}$ +- $P = \text{image\_model\_config.patch\_size}$ + +The aspect ratio of the raw predicted depth is maintained as the aspect ratio of the input image. + ### Using Scaled Dot Product Attention (SDPA) PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. 
This function @@ -178,6 +157,7 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` - Research Paper: [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/pdf/2410.02073) - Official Implementation: [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro) +- DepthPro Inference Notebook: [DepthPro Inference](https://github.com/qubvel/transformers-notebooks/blob/main/notebooks/DepthPro_inference.ipynb) - DepthPro for Super Resolution and Image Segmentation - Read blog on Medium: [Depth Pro: Beyond Depth](https://medium.com/@raoarmaghanshakir040/depth-pro-beyond-depth-9d822fc557ba) - Code on Github: [geetu040/depthpro-beyond-depth](https://github.com/geetu040/depthpro-beyond-depth) From 90627677b2d2e309c26cfd3a3e2e6dfa4acf868b Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 30 Jan 2025 19:39:02 +0500 Subject: [PATCH 106/151] improve initialization of convolution layers --- src/transformers/models/depth_pro/modeling_depth_pro.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index eabcbe990dbf..cc636e4d494a 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -635,7 +635,7 @@ class DepthProPreTrainedModel(PreTrainedModel): def _init_weights(self, module): """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) @@ -644,6 +644,10 @@ def _init_weights(self, module): elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) + elif isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)): + nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu') + if module.bias is not None: + module.bias.data.zero_() @add_start_docstrings( From fcba6bd16b5cb884f1414ff42ac611a9b417a719 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 30 Jan 2025 19:43:29 +0500 Subject: [PATCH 107/151] fix unused fov keys --- src/transformers/models/depth_pro/modeling_depth_pro.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index cc636e4d494a..a5ff0c48c058 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -632,6 +632,7 @@ class DepthProPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _supports_sdpa = True _no_split_modules = [] + _keys_to_ignore_on_load_unexpected = ['fov_model.*'] def _init_weights(self, module): """Initialize the weights""" From 56cd570cfc346c44e4609978c7c6527b9d67b4c5 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 30 Jan 2025 20:42:54 +0500 Subject: [PATCH 108/151] update tests --- .../test_image_processing_depth_pro.py | 6 +++- .../depth_pro/test_modeling_depth_pro.py | 33 +++++++++++-------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index e9d94151e145..de2f09063a67 
100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -17,7 +17,7 @@ import unittest from transformers.file_utils import is_vision_available -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import is_flaky, require_torch, require_vision from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -111,3 +111,7 @@ def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + @is_flaky(max_attempts=5, description="fast and slow, both use torch implementation, see: https://github.com/huggingface/transformers/issues/34920") + def test_fast_is_faster_than_slow(self): + super().test_fast_is_faster_than_slow() diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 4347d507ae34..a89f6a1195b6 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -300,22 +300,27 @@ def test_initialization(self): configs_no_init = _config_zero_init(config) for model_class in self.all_model_classes: model = model_class(config=configs_no_init) - # Skip the check for the backbone - backbone_params = [] - for name, module in model.named_modules(): - if module.__class__.__name__ == "DepthProViTHybridEmbeddings": - backbone_params = [f"{name}.{key}" for key in module.state_dict().keys()] - break - for name, param in model.named_parameters(): + non_uniform_init_parms = [ + # these encoders are vision transformers + # any layer outside these encoders is either Conv2d or ConvTranspose2d + # which use kaiming initialization + "patch_encoder", + "image_encoder", + "fov_model.encoder", + ] if param.requires_grad: - if name in backbone_params: - continue - self.assertIn( - ((param.data.mean() * 1e9).round() / 1e9).item(), - [0.0, 1.0], - msg=f"Parameter {name} of model {model_class} seems not properly initialized", - ) + if any(x in name for x in non_uniform_init_parms): + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertTrue( + -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) @slow def test_model_from_pretrained(self): From 26b1391d0138275895d86da08dd97ceb435dcf3e Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 30 Jan 2025 21:10:11 +0500 Subject: [PATCH 109/151] ruff format --- src/transformers/models/depth_pro/modeling_depth_pro.py | 4 ++-- tests/models/depth_pro/test_image_processing_depth_pro.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index a5ff0c48c058..7e36d3f8f306 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -632,7 +632,7 @@ class DepthProPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _supports_sdpa = True _no_split_modules = [] - _keys_to_ignore_on_load_unexpected = ['fov_model.*'] + _keys_to_ignore_on_load_unexpected = ["fov_model.*"] def _init_weights(self, 
module): """Initialize the weights""" @@ -646,7 +646,7 @@ def _init_weights(self, module): module.bias.data.zero_() module.weight.data.fill_(1.0) elif isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)): - nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu') + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") if module.bias is not None: module.bias.data.zero_() diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index de2f09063a67..0e830698c0a1 100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -112,6 +112,8 @@ def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - @is_flaky(max_attempts=5, description="fast and slow, both use torch implementation, see: https://github.com/huggingface/transformers/issues/34920") + @is_flaky( + description="fast and slow, both processors use torch implementation, see: https://github.com/huggingface/transformers/issues/34920", + ) def test_fast_is_faster_than_slow(self): super().test_fast_is_faster_than_slow() From 01247f8e29e83be11a5d7e92aa37673a205ae1fe Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 31 Jan 2025 10:12:26 +0500 Subject: [PATCH 110/151] fix test, amid kaimming initialization --- tests/models/depth_pro/test_modeling_depth_pro.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index a89f6a1195b6..2f728ada14df 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -45,7 +45,7 @@ def __init__( parent, batch_size=8, image_size=64, - patch_size=8, + patch_size=16, num_channels=3, is_training=True, use_labels=True, @@ -322,6 +322,11 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) + # this started when switched from normal initialization to kaiming_normal intialization + # maybe because the magnitude of offset values from ViT-encoders increases when followed by many convolution layers + def test_batching_equivalence(self, atol=1e-4, rtol=1e-4): + super().test_batching_equivalence(atol=atol, rtol=rtol) + @slow def test_model_from_pretrained(self): model_path = "geetu040/DepthPro" From 0b7e77fbb750a0fce386186f461f5dd564f498e2 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 31 Jan 2025 20:54:12 +0500 Subject: [PATCH 111/151] add depthpro to toctree --- docs/source/en/_toctree.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 1e02abd1bb76..ff3359628de8 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -651,6 +651,8 @@ title: Depth Anything - local: model_doc/depth_anything_v2 title: Depth Anything V2 + - local: model_doc/depth_pro + title: DepthPro - local: model_doc/deta title: DETA - local: model_doc/detr From 20b277de61c908b05a62c9d0be14c4899b2fee90 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 31 Jan 2025 21:04:58 +0500 Subject: [PATCH 112/151] add residual layer to _no_split_modules --- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 7e36d3f8f306..85a866e860d7 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -631,7 +631,7 @@ class DepthProPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" supports_gradient_checkpointing = True _supports_sdpa = True - _no_split_modules = [] + _no_split_modules = ["DepthProPreActResidualLayer"] _keys_to_ignore_on_load_unexpected = ["fov_model.*"] def _init_weights(self, module): From ff0e408cb27446e7977e2ffde01bf4d2655063e8 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 1 Feb 2025 18:15:05 +0500 Subject: [PATCH 113/151] architecture rework --- .../convert_depth_pro_weights_to_hf.py | 51 +- .../models/depth_pro/modeling_depth_pro.py | 623 ++++++++++-------- 2 files changed, 384 insertions(+), 290 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index e17e53efe8cc..bddc3114ffec 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -32,44 +32,44 @@ ORIGINAL_TO_CONVERTED_KEY_MAPPING = { # encoder and head - r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.embeddings.cls_token", - r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.embeddings.position_embeddings", - r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.embeddings.patch_embeddings.projection.\2", - r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.norm\3.\4", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.attention.(query|key|value).\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.attention.output.dense.\3", - r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.layer_scale\3.lambda1", - r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.encoder.layer.\2.mlp.fc\3.\4", - r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.layernorm.\2", - r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.encoder.fuse_image_with_low_res.\1", + r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.model.embeddings.cls_token", + r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.model.embeddings.position_embeddings", + r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.embeddings.patch_embeddings.projection.\2", + r"encoder.(patch|image)_encoder.blocks.(\d+).norm(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.norm\3.\4", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.qkv.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.attention.attention.(query|key|value).\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).attn.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.attention.output.dense.\3", + r"encoder.(patch|image)_encoder.blocks.(\d+).ls(\d+).gamma": 
r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.layer_scale\3.lambda1", + r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.mlp.fc\3.\4", + r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.layernorm.\2", + r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.neck.fuse_image_with_low_res.\1", r"head.(\d+).(weight|bias)": r"head.head.\1.\2", # fov - r"fov.encoder.0.cls_token": r"fov_model.encoder.embeddings.cls_token", - r"fov.encoder.0.pos_embed": r"fov_model.encoder.embeddings.position_embeddings", - r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.encoder.embeddings.patch_embeddings.projection.\1", - r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.norm\2.\3", - r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.attention.(query|key|value).\2", - r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.encoder.encoder.layer.\1.attention.output.dense.\2", - r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.encoder.encoder.layer.\1.layer_scale\2.lambda1", - r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.encoder.encoder.layer.\1.mlp.fc\2.\3", - r"fov.encoder.0.norm.(weight|bias)": r"fov_model.encoder.layernorm.\1", + r"fov.encoder.0.cls_token": r"fov_model.fov_encoder.model.embeddings.cls_token", + r"fov.encoder.0.pos_embed": r"fov_model.fov_encoder.model.embeddings.position_embeddings", + r"fov.encoder.0.patch_embed.proj.(weight|bias)": r"fov_model.fov_encoder.model.embeddings.patch_embeddings.projection.\1", + r"fov.encoder.0.blocks.(\d+).norm(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.norm\2.\3", + r"fov.encoder.0.blocks.(\d+).attn.qkv.(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.attention.attention.(query|key|value).\2", + r"fov.encoder.0.blocks.(\d+).attn.proj.(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.attention.output.dense.\2", + r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.fov_encoder.model.encoder.layer.\1.layer_scale\2.lambda1", + r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.mlp.fc\2.\3", + r"fov.encoder.0.norm.(weight|bias)": r"fov_model.fov_encoder.model.layernorm.\1", r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", - r"fov.encoder.1.(weight|bias)": r"fov_model.encoder_neck.\1", - r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.\1.\2", + r"fov.encoder.1.(weight|bias)": r"fov_model.fov_encoder.neck.\1", + r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.layers.\1.\2", # upsamples - r"encoder.upsample_lowres.(weight|bias)": r"depth_pro.encoder.feature_upsample.image_block.layers.0.\1", + r"encoder.upsample_lowres.(weight|bias)": r"depth_pro.neck.feature_upsample.image_block.layers.0.\1", r"encoder.upsample_latent(\d+).(\d+).(weight|bias)": lambda match: ( - f"depth_pro.encoder.feature_upsample.intermediate.{1-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" + f"depth_pro.neck.feature_upsample.intermediate.{1-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" ), r"encoder.upsample(\d+).(\d+).(weight|bias)": lambda match: ( - f"depth_pro.encoder.feature_upsample.scaled_images.{2-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" + 
f"depth_pro.neck.feature_upsample.scaled_images.{2-int(match.group(1))}.layers.{match.group(2)}.{match.group(3)}" ), # projections between encoder and fusion r"decoder.convs.(\d+).weight": lambda match: ( - f"depth_pro.encoder.feature_projection.projections.{4-int(match.group(1))}.weight" + f"depth_pro.neck.feature_projection.projections.{4-int(match.group(1))}.weight" ), # fusion stage @@ -160,7 +160,6 @@ def write_model( # download and load state_dict from hf repo file_path = hf_hub_download(hf_repo_id, "depth_pro.pt") - # file_path = "/home/geetu/work/hf/depth_pro/depth_pro.pt" # when you already have the files locally loaded = torch.load(file_path, weights_only=True) # ensure state_dict is in float32 diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 85a866e860d7..b9ddc579ef36 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -47,8 +47,8 @@ class DepthProOutput(ModelOutput): Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. - features (`List[torch.FloatTensor]`, *optional*: - Features from scaled images and hidden_states. + features (`Union[torch.FloatTensor, List[torch.FloatTensor]]`, *optional*): + Features from encoders. Can be a single feature or a list of features. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`. 
@@ -63,7 +63,7 @@ class DepthProOutput(ModelOutput): """ last_hidden_state: torch.FloatTensor = None - features: Optional[List[torch.FloatTensor]] = None + features: Union[torch.FloatTensor, List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -100,138 +100,6 @@ class DepthProDepthEstimatorOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor, ...]] = None -class DepthProFeatureUpsampleBlock(nn.Module): - def __init__( - self, - config: DepthProConfig, - input_dims: int, - intermediate_dims: int, - output_dims: int, - n_upsample_layers: int, - use_proj: bool = True, - bias: bool = False, - ): - super().__init__() - self.config = config - self.layers = nn.ModuleList() - - # create first projection layer - if use_proj: - proj = nn.Conv2d( - in_channels=input_dims, - out_channels=intermediate_dims, - kernel_size=1, - stride=1, - padding=0, - bias=bias, - ) - self.layers.append(proj) - - # create following upsample layers - for i in range(n_upsample_layers): - in_channels = intermediate_dims if i == 0 else output_dims - layer = nn.ConvTranspose2d( - in_channels=in_channels, - out_channels=output_dims, - kernel_size=2, - stride=2, - padding=0, - bias=bias, - ) - self.layers.append(layer) - - def forward(self, features: torch.Tensor) -> torch.Tensor: - for layer in self.layers: - features = layer(features) - return features - - -class DepthProFeatureUpsample(nn.Module): - def __init__(self, config: DepthProConfig): - super().__init__() - self.config = config - self.n_scaled_images = len(self.config.scaled_images_ratios) - self.n_intermediate_hooks = len(self.config.intermediate_hook_ids) - - # for image_features - self.image_block = DepthProFeatureUpsampleBlock( - config=config, - input_dims=config.image_model_config.hidden_size, - intermediate_dims=config.image_model_config.hidden_size, - output_dims=config.scaled_images_feature_dims[0], - n_upsample_layers=1, - use_proj=False, - bias=True, - ) - - # for scaled_images_features - self.scaled_images = nn.ModuleList() - for i, feature_dims in enumerate(config.scaled_images_feature_dims): - block = DepthProFeatureUpsampleBlock( - config=config, - input_dims=config.patch_model_config.hidden_size, - intermediate_dims=feature_dims, - output_dims=feature_dims, - n_upsample_layers=1, - ) - self.scaled_images.append(block) - - # for intermediate_features - self.intermediate = nn.ModuleList() - for i, feature_dims in enumerate(config.intermediate_feature_dims): - intermediate_dims = config.fusion_hidden_size if i == 0 else feature_dims - block = DepthProFeatureUpsampleBlock( - config=config, - input_dims=config.patch_model_config.hidden_size, - intermediate_dims=intermediate_dims, - output_dims=feature_dims, - n_upsample_layers=2 + i, - ) - self.intermediate.append(block) - - def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: - features[0] = self.image_block(features[0]) - - for i in range(self.n_scaled_images): - features[i + 1] = self.scaled_images[i](features[i + 1]) - - for i in range(self.n_intermediate_hooks): - features[self.n_scaled_images + i + 1] = self.intermediate[i](features[self.n_scaled_images + i + 1]) - - return features - - -class DepthProFeatureProjection(nn.Module): - def __init__(self, config: DepthProConfig): - super().__init__() - self.config = config - - combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims - self.projections = nn.ModuleList() - for i, 
in_channels in enumerate(combined_feature_dims): - if i == len(combined_feature_dims) - 1 and in_channels == config.fusion_hidden_size: - # projection for last layer can be ignored if input and output channels already match - self.projections.append(nn.Identity()) - else: - self.projections.append( - nn.Conv2d( - in_channels=in_channels, - out_channels=config.fusion_hidden_size, - kernel_size=3, - stride=1, - padding=1, - bias=False, - ) - ) - - def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: - projected_features = [] - for i, projection in enumerate(self.projections): - upsampled_feature = projection(features[i]) - projected_features.append(upsampled_feature) - return projected_features - - def split_to_patches(pixel_values: torch.Tensor, patch_size: int, overlap_ratio: float) -> torch.Tensor: """Creates Patches from Batch.""" batch_size, num_channels, height, width = pixel_values.shape @@ -369,11 +237,10 @@ def reconstruct_feature_maps( return features -class DepthProEncoder(nn.Module): +class DepthProPatchEncoder(nn.Module): def __init__(self, config: DepthProConfig): super().__init__() self.config = config - self.fusion_hidden_size = config.fusion_hidden_size self.intermediate_hook_ids = config.intermediate_hook_ids self.intermediate_feature_dims = config.intermediate_feature_dims @@ -382,48 +249,17 @@ def __init__(self, config: DepthProConfig): self.scaled_images_feature_dims = config.scaled_images_feature_dims self.merge_padding_value = config.merge_padding_value - self.n_scaled_images = len(self.scaled_images_ratios) - self.n_intermediate_hooks = len(self.intermediate_hook_ids) - - # patch encoder - self.patch_encoder = AutoModel.from_config(config.patch_model_config) - - # image encoder - self.image_encoder = AutoModel.from_config(config.image_model_config) - - # upsample features - self.feature_upsample = DepthProFeatureUpsample(config) - - # for STEP 7: fuse low_res and image features - self.fuse_image_with_low_res = nn.Conv2d( - in_channels=config.scaled_images_feature_dims[0] * 2, - out_channels=config.scaled_images_feature_dims[0], - kernel_size=1, - stride=1, - padding=0, - bias=True, - ) + self.n_scaled_images = len(config.scaled_images_ratios) + self.n_intermediate_hooks = len(config.intermediate_hook_ids) + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size - # project features - self.feature_projection = DepthProFeatureProjection(config) + self.model = AutoModel.from_config(config.patch_model_config) def forward( self, pixel_values: torch.Tensor, head_mask: Optional[torch.Tensor] = None, - output_attentions: bool = False, - output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[tuple, DepthProOutput]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if pixel_values.dim() != 4: - raise ValueError("Input tensor must have shape (batch_size, num_channels, height, width).") - + ) -> List[torch.Tensor]: batch_size, num_channels, height, width = pixel_values.shape if min(self.scaled_images_ratios) * min(height, width) < self.config.patch_size: @@ -457,39 +293,23 @@ def forward( n_patches_per_scaled_image = [len(i) for i in scaled_images] patches = torch.cat(scaled_images[::-1], dim=0) # -1 as patch encoder expects high 
res patches first - # STEP 3: apply patch and image encoder + # STEP 3: apply patch encoder - patch_encodings = self.patch_encoder( + encodings = self.model( # each patch is processed as a separate batch patches, head_mask=head_mask, # required for intermediate features output_hidden_states=self.n_intermediate_hooks > 0, - return_dict=return_dict, ) - scaled_images_last_hidden_state = torch.split_with_sizes(patch_encodings[0], n_patches_per_scaled_image[::-1]) - scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] + scaled_images_last_hidden_state = torch.split_with_sizes(encodings[0], n_patches_per_scaled_image[::-1]) # -1 (reverse list) as patch encoder returns high res patches first, we need low res first - - # scale the image to patch size for image_encoder - image_scaled_to_patch_size = F.interpolate( - pixel_values, - size=(self.config.patch_size, self.config.patch_size), - mode="bilinear", - align_corners=False, - ) - image_encodings = self.image_encoder( - pixel_values=image_scaled_to_patch_size, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) + scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1] # calculate base height and width # base height and width are the dimensions of the lowest resolution features - out_size = torch_int(image_encodings[0].shape[1] ** 0.5) - exponent_value = torch_int(math.log2(width / out_size)) + exponent_value = torch_int(math.log2(width / self.out_size)) base_height = height // 2**exponent_value base_width = width // 2**exponent_value @@ -515,7 +335,7 @@ def forward( intermediate_features = [] for i in range(self.n_intermediate_hooks): # +1 to correct index position as hidden_states contain embedding output as well - hidden_state = patch_encodings[2][self.intermediate_hook_ids[i] + 1] + hidden_state = encodings[2][self.intermediate_hook_ids[i] + 1] padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[-1])) output_height = base_height * 2 ** (self.n_scaled_images - 1) output_width = base_width * 2 ** (self.n_scaled_images - 1) @@ -527,39 +347,112 @@ def forward( ) intermediate_features.append(features) - # STEP 6: get image features - (6) in diagram + # STEP 7: combine all features + features = [*scaled_images_features, *intermediate_features] + + return features + - image_features = reconstruct_feature_maps( - image_encodings[0], +class DepthProImageEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size + + self.model = AutoModel.from_config(config.image_model_config) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, DepthProOutput]: + batch_size, num_channels, height, width = pixel_values.shape + + # scale the image to patch size for image_encoder + pixel_values = F.interpolate( + pixel_values, + size=(self.config.patch_size, self.config.patch_size), + mode="bilinear", + align_corners=False, + ) + encodings = self.model( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + # calculate base height and width + # base height and width are the dimensions of the lowest resolution features + exponent_value = 
torch_int(math.log2(width / self.out_size)) + base_height = height // 2**exponent_value + base_width = width // 2**exponent_value + + features = reconstruct_feature_maps( + encodings[0], batch_size=batch_size, padding=0, output_size=(base_height, base_width), ) - # STEP 7: combine all features - features = [ - image_features, - *scaled_images_features, - *intermediate_features, - ] + if not return_dict: + return (encodings[0], features) + encodings[2:] # ignore last_hidden_state and poooler output - # STEP 8: upsample features - features = self.feature_upsample(features) + return DepthProOutput( + last_hidden_state=encodings.last_hidden_state, + features=features, + hidden_states=encodings.hidden_states, + attentions=encodings.attentions, + ) - # STEP 9: apply fusion - # (global features = low res features + image features) - # fuses image_features with lowest resolution features as they are of same size - global_features = torch.cat((features[1], features[0]), dim=1) - global_features = self.fuse_image_with_low_res(global_features) - features = [global_features, *features[2:]] - # STEP 10: project features - features = self.feature_projection(features) +class DepthProEncoder(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.intermediate_hook_ids = config.intermediate_hook_ids + self.intermediate_feature_dims = config.intermediate_feature_dims + self.scaled_images_ratios = config.scaled_images_ratios + self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios + self.scaled_images_feature_dims = config.scaled_images_feature_dims + self.merge_padding_value = config.merge_padding_value - # STEP 11: return output + self.n_scaled_images = len(self.scaled_images_ratios) + self.n_intermediate_hooks = len(self.intermediate_hook_ids) + + self.patch_encoder = DepthProPatchEncoder(config) + self.image_encoder = DepthProImageEncoder(config) + + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, DepthProOutput]: + batch_size, num_channels, height, width = pixel_values.shape + + patch_features = self.patch_encoder( + pixel_values, + head_mask=head_mask, + ) + image_encodings = self.image_encoder( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_features = image_encodings[1] # index 1 contains features + + features = [image_features, *patch_features] if not return_dict: - return (image_encodings[0], features) + image_encodings[2:] # ignore last_hidden_state and poooler output + return (image_encodings[0], features) + image_encodings[2:] return DepthProOutput( last_hidden_state=image_encodings.last_hidden_state, @@ -569,6 +462,164 @@ def forward( ) +class DepthProFeatureUpsampleBlock(nn.Module): + def __init__( + self, + config: DepthProConfig, + input_dims: int, + intermediate_dims: int, + output_dims: int, + n_upsample_layers: int, + use_proj: bool = True, + bias: bool = False, + ): + super().__init__() + self.config = config + self.layers = nn.ModuleList() + + # create first projection layer + if use_proj: + proj = nn.Conv2d( + in_channels=input_dims, + out_channels=intermediate_dims, + kernel_size=1, + stride=1, + padding=0, + bias=bias, + ) + self.layers.append(proj) + + # create following upsample layers + for i in range(n_upsample_layers): + in_channels 
= intermediate_dims if i == 0 else output_dims + layer = nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=output_dims, + kernel_size=2, + stride=2, + padding=0, + bias=bias, + ) + self.layers.append(layer) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + for layer in self.layers: + features = layer(features) + return features + + +class DepthProFeatureUpsample(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.n_scaled_images = len(self.config.scaled_images_ratios) + self.n_intermediate_hooks = len(self.config.intermediate_hook_ids) + + # for image_features + self.image_block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.image_model_config.hidden_size, + intermediate_dims=config.image_model_config.hidden_size, + output_dims=config.scaled_images_feature_dims[0], + n_upsample_layers=1, + use_proj=False, + bias=True, + ) + + # for scaled_images_features + self.scaled_images = nn.ModuleList() + for i, feature_dims in enumerate(config.scaled_images_feature_dims): + block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.patch_model_config.hidden_size, + intermediate_dims=feature_dims, + output_dims=feature_dims, + n_upsample_layers=1, + ) + self.scaled_images.append(block) + + # for intermediate_features + self.intermediate = nn.ModuleList() + for i, feature_dims in enumerate(config.intermediate_feature_dims): + intermediate_dims = config.fusion_hidden_size if i == 0 else feature_dims + block = DepthProFeatureUpsampleBlock( + config=config, + input_dims=config.patch_model_config.hidden_size, + intermediate_dims=intermediate_dims, + output_dims=feature_dims, + n_upsample_layers=2 + i, + ) + self.intermediate.append(block) + + def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: + features[0] = self.image_block(features[0]) + + for i in range(self.n_scaled_images): + features[i + 1] = self.scaled_images[i](features[i + 1]) + + for i in range(self.n_intermediate_hooks): + features[self.n_scaled_images + i + 1] = self.intermediate[i](features[self.n_scaled_images + i + 1]) + + return features + + +class DepthProFeatureProjection(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims + self.projections = nn.ModuleList() + for i, in_channels in enumerate(combined_feature_dims): + if i == len(combined_feature_dims) - 1 and in_channels == config.fusion_hidden_size: + # projection for last layer can be ignored if input and output channels already match + self.projections.append(nn.Identity()) + else: + self.projections.append( + nn.Conv2d( + in_channels=in_channels, + out_channels=config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + ) + + def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: + projected_features = [] + for i, projection in enumerate(self.projections): + upsampled_feature = projection(features[i]) + projected_features.append(upsampled_feature) + return projected_features + + +class DepthProNeck(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + + self.feature_upsample = DepthProFeatureUpsample(config) + self.fuse_image_with_low_res = nn.Conv2d( + in_channels=config.scaled_images_feature_dims[0] * 2, + out_channels=config.scaled_images_feature_dims[0], + kernel_size=1, + stride=1, + padding=0, + 
bias=True, + ) + self.feature_projection = DepthProFeatureProjection(config) + + def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: + features = self.feature_upsample(features) + # global features = low res features + image features + global_features = torch.cat((features[1], features[0]), dim=1) + global_features = self.fuse_image_with_low_res(global_features) + features = [global_features, *features[2:]] + features = self.feature_projection(features) + return features + + # General docstring _CONFIG_FOR_DOC = "DepthProConfig" @@ -660,22 +711,20 @@ def __init__(self, config): super().__init__(config) self.config = config self.encoder = DepthProEncoder(config) + self.neck = DepthProNeck(config) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): - # TODO: return hidden_states from patch_encodings instead of image_encodings - # return self.encoder.patch_encoder.embeddings.patch_embeddings - return self.encoder.image_encoder.embeddings.patch_embeddings + return self.encoder.image_encoder.model.get_input_embeddings() def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel """ - for layer, heads in heads_to_prune.items(): - self.encoder.patch_encoder.encoder.layer[layer].attention.prune_heads(heads) - self.encoder.image_encoder.encoder.layer[layer].attention.prune_heads(heads) + self.encoder.patch_encoder.model._prune_heads(heads_to_prune) + self.encoder.image_encoder.model._prune_heads(heads_to_prune) @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) @@ -727,8 +776,18 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) + features = encodings[1] # index 1 contains features + features = self.neck(features) + + if not return_dict: + return (encodings[0], features) + encodings[2:] - return encodings + return DepthProOutput( + last_hidden_state=encodings.last_hidden_state, + features=features, + hidden_states=encodings.hidden_states, + attentions=encodings.attentions, + ) # Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPT->DepthPro @@ -863,25 +922,63 @@ def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: return fused_hidden_states -class DepthProFOVModel(nn.Module): +class DepthProFOVEncoder(nn.Module): def __init__(self, config: DepthProConfig): super().__init__() self.config = config - self.fusion_hidden_size = config.fusion_hidden_size + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size - self.out_size = config.fov_model_config.image_size // config.fov_model_config.patch_size + self.model = AutoModel.from_config(config.fov_model_config) + self.neck = nn.Linear(config.fov_model_config.hidden_size, config.fusion_hidden_size // 2) - self.encoder = AutoModel.from_config(config.fov_model_config) - self.encoder_neck = nn.Linear(config.fov_model_config.hidden_size, self.fusion_hidden_size // 2) - self.global_neck = nn.Sequential( - nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), + def forward( + self, + pixel_values: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + + # scale the image to patch size for image_encoder + pixel_values 
= F.interpolate( + pixel_values, + size=(self.config.patch_size, self.config.patch_size), + mode="bilinear", + align_corners=False, ) + encodings = self.model( + pixel_values=pixel_values, + head_mask=head_mask, + ) + hidden_state = encodings[0] + hidden_state = self.neck(hidden_state) + + # calculate base height and width + # base height and width are the dimensions of the lowest resolution features + exponent_value = torch_int(math.log2(width / self.out_size)) + base_height = height // 2**exponent_value + base_width = width // 2**exponent_value + + features = reconstruct_feature_maps( + hidden_state, + batch_size=batch_size, + padding=0, + output_size=(base_height, base_width), + ) + + return features + + +class DepthProFOVHead(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.fusion_hidden_size = config.fusion_hidden_size + self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size # create initial head layers - self.head = nn.Sequential() + self.layers = nn.ModuleList() for i in range(config.num_fov_head_layers): - self.head.append( + self.layers.append( nn.Conv2d( math.ceil(self.fusion_hidden_size / 2 ** (i + 1)), math.ceil(self.fusion_hidden_size / 2 ** (i + 2)), @@ -890,55 +987,53 @@ def __init__(self, config: DepthProConfig): padding=1, ) ) - self.head.append(nn.ReLU(True)) + self.layers.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1)) final_kernal_size = torch_int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) - self.head.append( + self.layers.append( nn.Conv2d( in_channels=final_in_channels, out_channels=1, kernel_size=final_kernal_size, stride=1, padding=0 ) ) - def forward( - self, - pixel_values: torch.Tensor, - global_features: torch.Tensor, - head_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - batch_size, num_channels, height, width = pixel_values.shape - - image_scaled_to_patch_size = F.interpolate( - pixel_values, - size=(self.config.patch_size, self.config.patch_size), + def forward(self, features: torch.Tensor) -> torch.Tensor: + features = F.interpolate( + features, + size=(self.out_size, self.out_size), mode="bilinear", align_corners=False, ) - encodings = self.encoder( - image_scaled_to_patch_size, - head_mask=head_mask, - ) - hidden_state = encodings[0] - hidden_state = self.encoder_neck(hidden_state) + for layer in self.layers: + features = layer(features) + return features - fov_features = reconstruct_feature_maps( - hidden_state, - batch_size=batch_size, - padding=0, - output_size=(self.out_size, self.out_size), + +class DepthProFOVModel(nn.Module): + def __init__(self, config: DepthProConfig): + super().__init__() + self.config = config + self.fusion_hidden_size = config.fusion_hidden_size + + self.fov_encoder = DepthProFOVEncoder(config) + self.global_neck = nn.Sequential( + nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), + nn.ReLU(True), ) + self.head = DepthProFOVHead(config) + def forward( + self, + pixel_values: torch.Tensor, + global_features: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + fov_features = self.fov_encoder(pixel_values, head_mask) global_features = self.global_neck(global_features) - global_features = F.interpolate( - global_features, - size=(self.out_size, self.out_size), - mode="bilinear", 
- align_corners=False, - ) fov_features = fov_features + global_features fov_output = self.head(fov_features) - fov_output = fov_output.reshape(batch_size) + fov_output = fov_output.squeeze() return fov_output From 1522c530ef979894b39377118ae1fe4516e6b096 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Sat, 1 Feb 2025 19:42:28 +0500 Subject: [PATCH 114/151] Update src/transformers/models/depth_pro/image_processing_depth_pro.py Co-authored-by: Pavel Iakubovskii --- src/transformers/models/depth_pro/image_processing_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 8b7c84b71943..60bea8460cb4 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -20,7 +20,7 @@ if TYPE_CHECKING: - from ...modeling_outputs import DepthProDepthEstimatorOutput + from .modeling_depth_pro import DepthProDepthEstimatorOutput from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import to_channel_dimension_format From 131817ad1ea403cee411fbdce26ecef32df3b39a Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Sat, 1 Feb 2025 19:42:40 +0500 Subject: [PATCH 115/151] Update src/transformers/models/depth_pro/image_processing_depth_pro_fast.py Co-authored-by: Pavel Iakubovskii --- .../models/depth_pro/image_processing_depth_pro_fast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 482755a83f52..15ac15a90ddb 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -37,7 +37,8 @@ if TYPE_CHECKING: - from ...modeling_outputs import DepthProDepthEstimatorOutput + from .modeling_depth_pro import DepthProDepthEstimatorOutput + logger = logging.get_logger(__name__) From 72a1f0cacc253cf7068293d2f4bba74e603a94d1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 1 Feb 2025 19:56:49 +0500 Subject: [PATCH 116/151] update docs --- docs/source/en/model_doc/depth_pro.md | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 20b526dda76e..a2076ec8cb75 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -26,7 +26,7 @@ The abstract from the paper is the following: *We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. 
Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.* -drawing DepthPro Outputs. Taken from the official code. @@ -113,19 +113,6 @@ Or set `use_fov_model=True` when initializing the model, which overrides the val >>> model = DepthProForDepthEstimation(config, use_fov_model=True) ``` -### Image Resolution and Aspect Ratio - -The network can process images of different resolutions and aspect ratios and the predicted depth size can be calculated using the following formula: - -$\text{Predicted Depth Size} = \frac{2^{N+1} \cdot S}{P}$ - -Where: -- $N = \text{len}(\text{intermediate\_hook\_ids}) + \text{len}(\text{scaled\_images\_ratios})$ -- $S = \text{image\_model\_config.image\_size}$ -- $P = \text{image\_model\_config.patch\_size}$ - -The aspect ratio of the raw predicted depth is maintained as the aspect ratio of the input image. - ### Using Scaled Dot Product Attention (SDPA) PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function From aed7e3dacaa8866a292e87767596d165bdd76f01 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 1 Feb 2025 20:17:48 +0500 Subject: [PATCH 117/151] improve merge_patches --- .../models/depth_pro/modeling_depth_pro.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index b9ddc579ef36..326ab7296482 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -177,18 +177,27 @@ def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch boxes_in_row = [] for w in range(sqrt_n_patches_per_batch): box = patches[batch_size * i : batch_size * (i + 1)] + + # collect paddings + paddings = [0, 0, 0, 0] if h != 0: # remove pad from height if box is not at top border - box = box[..., padding:, :] + paddings[0] = padding if w != 0: # remove pad from width if box is not at left border - box = box[..., :, padding:] + paddings[2] = padding if h != sqrt_n_patches_per_batch - 1: # remove pad from height if box is not at bottom border - box = box[..., : box.shape[-2] - padding, :] + paddings[1] = padding if w != sqrt_n_patches_per_batch - 1: # remove pad from width if box is not at right border - box = box[..., :, : box.shape[-1] - padding] + paddings[3] = padding + + # remove paddings + _, _, box_h, box_w = box.shape + pad_top, pad_bottom, pad_left, pad_right = paddings + box = box[:, :, pad_top:box_h - pad_bottom, pad_left:box_w - pad_right] + boxes_in_row.append(box) i += 1 boxes_in_row = torch.cat(boxes_in_row, dim=-1) From 405bee3de4850a0688e6b7466223a4da6f98ef86 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 1 Feb 2025 20:18:10 +0500 Subject: [PATCH 118/151] use flatten with fov_output --- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 326ab7296482..f5f741d3d758 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1042,7 +1042,7 @@ def forward( fov_features = fov_features + global_features fov_output = self.head(fov_features) - fov_output = fov_output.squeeze() + fov_output = fov_output.flatten() return fov_output 
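
Patch 117 above collects the per-edge paddings before slicing, so that `merge_patches` trims the overlap only on sides that touch a neighbouring patch, and patch 118 flattens the field-of-view output to one value per image. Below is a minimal sketch of the same trimming idea for a single image, using a hypothetical `merge_patch_grid` helper over a row-major grid of patches; the real implementation additionally interleaves patches across the batch dimension.

```python
import torch

def merge_patch_grid(patches: torch.Tensor, grid_size: int, padding: int) -> torch.Tensor:
    # patches: (grid_size * grid_size, channels, patch_h, patch_w), row-major order
    rows = []
    for h in range(grid_size):
        row = []
        for w in range(grid_size):
            box = patches[h * grid_size + w]
            # trim the overlap only on edges that touch a neighbouring patch
            top = padding if h != 0 else 0
            bottom = padding if h != grid_size - 1 else 0
            left = padding if w != 0 else 0
            right = padding if w != grid_size - 1 else 0
            box = box[:, top : box.shape[-2] - bottom, left : box.shape[-1] - right]
            row.append(box)
        rows.append(torch.cat(row, dim=-1))  # concatenate patches along width
    return torch.cat(rows, dim=-2)           # then stack rows along height

patches = torch.randn(4, 8, 96, 96)          # 2x2 grid, 16 px overlap on inner edges
merged = merge_patch_grid(patches, grid_size=2, padding=16)
print(merged.shape)                          # torch.Size([8, 160, 160])
```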
From a8528da17b77948e4c226e9f5c5881c163155469 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Sat, 1 Feb 2025 20:19:41 +0500 Subject: [PATCH 119/151] ruff formatting --- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index f5f741d3d758..b8cca868e8d1 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -196,7 +196,7 @@ def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch # remove paddings _, _, box_h, box_w = box.shape pad_top, pad_bottom, pad_left, pad_right = paddings - box = box[:, :, pad_top:box_h - pad_bottom, pad_left:box_w - pad_right] + box = box[:, :, pad_top : box_h - pad_bottom, pad_left : box_w - pad_right] boxes_in_row.append(box) i += 1 From 31383e12f45a9bede3e73fad3e3ebd862a1492b5 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Mon, 3 Feb 2025 18:59:41 +0500 Subject: [PATCH 120/151] update resources section in docs Co-authored-by: Pavel Iakubovskii --- docs/source/en/model_doc/depth_pro.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index a2076ec8cb75..9a18cfc8735d 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -142,6 +142,8 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` ## Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DepthPro: + - Research Paper: [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/pdf/2410.02073) - Official Implementation: [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro) - DepthPro Inference Notebook: [DepthPro Inference](https://github.com/qubvel/transformers-notebooks/blob/main/notebooks/DepthPro_inference.ipynb) @@ -149,8 +151,6 @@ On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` - Read blog on Medium: [Depth Pro: Beyond Depth](https://medium.com/@raoarmaghanshakir040/depth-pro-beyond-depth-9d822fc557ba) - Code on Github: [geetu040/depthpro-beyond-depth](https://github.com/geetu040/depthpro-beyond-depth) - - If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. 
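
The "Using Scaled Dot Product Attention (SDPA)" section referenced in the documentation hunks above is not shown in full here. For orientation, the usual way to opt into PyTorch's SDPA when loading a Transformers model is sketched below; the checkpoint name matches the one used elsewhere in this PR, and the half-precision dtype is illustrative rather than something prescribed by the diff.

```python
import torch
from transformers import DepthProForDepthEstimation

# request the SDPA attention implementation explicitly; "eager" is the fallback
model = DepthProForDepthEstimation.from_pretrained(
    "geetu040/DepthPro",
    attn_implementation="sdpa",
    torch_dtype=torch.float16,
)
```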
## DepthProConfig From 641cb841507d80e2bc122899498d7b33280270e2 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Mon, 3 Feb 2025 19:00:52 +0500 Subject: [PATCH 121/151] fix typo "final_kernal_size" Co-authored-by: Pavel Iakubovskii --- src/transformers/models/depth_pro/modeling_depth_pro.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index b8cca868e8d1..5e481045fed2 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -999,10 +999,10 @@ def __init__(self, config: DepthProConfig): self.layers.append(nn.ReLU(True)) # calculate expected shapes to finally generate a scalar output from final head layer final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1)) - final_kernal_size = torch_int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) + final_kernel_size = torch_int((self.out_size - 1) / 2**config.num_fov_head_layers + 1) self.layers.append( nn.Conv2d( - in_channels=final_in_channels, out_channels=1, kernel_size=final_kernal_size, stride=1, padding=0 + in_channels=final_in_channels, out_channels=1, kernel_size=final_kernel_size, stride=1, padding=0 ) ) From 6af8a1169d879c5296b82008cf08f7ebb7852354 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Mon, 3 Feb 2025 19:02:24 +0500 Subject: [PATCH 122/151] fix output typehint for DepthProDepthEstimator Co-authored-by: Pavel Iakubovskii --- src/transformers/models/depth_pro/modeling_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 5e481045fed2..5cd67b117060 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1114,7 +1114,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor]]: + ) -> Union[Tuple[torch.Tensor], DepthProDepthEstimatorOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): Ground truth depth estimation maps for computing the loss. 
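
The `final_kernel_size` renamed in patch 121 above is what lets the field-of-view head collapse its feature map to a single value per image: each stride-2 convolution roughly halves the spatial resolution, and the last convolution uses a kernel as large as whatever resolution is left. A rough shape check under assumed values (`fusion_hidden_size=256`, `out_size=48`, `num_fov_head_layers=2`, chosen only for illustration, with plain `int` standing in for the internal `torch_int` helper):

```python
import math
import torch
from torch import nn

fusion_hidden_size, out_size, num_fov_head_layers = 256, 48, 2  # assumed values

layers = []
for i in range(num_fov_head_layers):
    layers += [
        nn.Conv2d(
            math.ceil(fusion_hidden_size / 2 ** (i + 1)),
            math.ceil(fusion_hidden_size / 2 ** (i + 2)),
            kernel_size=3, stride=2, padding=1,
        ),
        nn.ReLU(True),
    ]
final_in_channels = math.ceil(fusion_hidden_size / 2 ** (num_fov_head_layers + 1))
final_kernel_size = int((out_size - 1) / 2**num_fov_head_layers + 1)  # 12 here
layers.append(nn.Conv2d(final_in_channels, 1, kernel_size=final_kernel_size, stride=1, padding=0))

features = torch.randn(1, fusion_hidden_size // 2, out_size, out_size)
for layer in layers:
    features = layer(features)   # spatial size: 48 -> 24 -> 12 -> 1
print(features.shape)            # torch.Size([1, 1, 1, 1]), one scalar per image
```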
From abd5307c7280a309d6e3b3ed09cd3d673dec4ad6 Mon Sep 17 00:00:00 2001 From: Armaghan Shakir Date: Mon, 3 Feb 2025 19:03:03 +0500 Subject: [PATCH 123/151] residual operation in 2 steps Co-authored-by: Pavel Iakubovskii --- src/transformers/models/depth_pro/modeling_depth_pro.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 5cd67b117060..3c6039030ca8 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -883,11 +883,11 @@ def __init__(self, config: DepthProConfig, use_deconv: bool = True): ) self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True) - self.skip_add = nn.quantized.FloatFunctional() def forward(self, hidden_state: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor: if residual is not None: - hidden_state = self.skip_add.add(hidden_state, self.residual_layer1(residual)) + residual = self.residual_layer1(residual) + hidden_state = hidden_state + residual hidden_state = self.residual_layer2(hidden_state) if self.use_deconv: From 8dc2751cbe3600177a4564117e79384132a0a052 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 19:15:30 +0500 Subject: [PATCH 124/151] use image_size instead of global patch_size in interpolation --- .../models/depth_pro/modeling_depth_pro.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 3c6039030ca8..c270ecd0018b 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -380,10 +380,11 @@ def forward( ) -> Union[tuple, DepthProOutput]: batch_size, num_channels, height, width = pixel_values.shape - # scale the image to patch size for image_encoder + # scale the image for image_encoder + size = self.config.image_model_config.image_size pixel_values = F.interpolate( pixel_values, - size=(self.config.patch_size, self.config.patch_size), + size=(size, size), mode="bilinear", align_corners=False, ) @@ -947,10 +948,11 @@ def forward( ) -> torch.Tensor: batch_size, num_channels, height, width = pixel_values.shape - # scale the image to patch size for image_encoder + # scale the image for fov_encoder + size = self.config.fov_model_config.image_size pixel_values = F.interpolate( pixel_values, - size=(self.config.patch_size, self.config.patch_size), + size=(size, size), mode="bilinear", align_corners=False, ) From 2f88694b3771478cfe1d9393fcc1b10977962b2c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 20:54:56 +0500 Subject: [PATCH 125/151] replace all Sequential with ModuleList --- .../models/depth_pro/modeling_depth_pro.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index c270ecd0018b..888e9d6dce39 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1027,10 +1027,10 @@ def __init__(self, config: DepthProConfig): self.fusion_hidden_size = config.fusion_hidden_size self.fov_encoder = DepthProFOVEncoder(config) - self.global_neck = nn.Sequential( + self.global_neck = nn.ModuleList([ 
nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), nn.ReLU(True), - ) + ]) self.head = DepthProFOVHead(config) def forward( @@ -1040,7 +1040,9 @@ def forward( head_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: fov_features = self.fov_encoder(pixel_values, head_mask) - global_features = self.global_neck(global_features) + + for layer in self.global_neck: + global_features = layer(global_features) fov_features = fov_features + global_features fov_output = self.head(fov_features) @@ -1062,7 +1064,7 @@ def __init__(self, config): self.config = config features = config.fusion_hidden_size - self.head = nn.Sequential( + self.head = nn.ModuleList([ nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), nn.ConvTranspose2d( in_channels=features // 2, out_channels=features // 2, kernel_size=2, stride=2, padding=0, bias=True @@ -1071,11 +1073,13 @@ def __init__(self, config): nn.ReLU(True), nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), nn.ReLU(), - ) + ]) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for layer in self.head: + hidden_states = layer(hidden_states) - def forward(self, hidden_states: List[torch.Tensor]) -> torch.Tensor: - predicted_depth = self.head(hidden_states) - predicted_depth = predicted_depth.squeeze(dim=1) + predicted_depth = hidden_states.squeeze(dim=1) return predicted_depth From 208ee26b9ef34577a1d4d18dd85a65b7a6a4556a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 21:34:50 +0500 Subject: [PATCH 126/151] update fov --- docs/source/en/model_doc/depth_pro.md | 3 ++- .../depth_pro/image_processing_depth_pro.py | 18 ++++++++++-------- .../image_processing_depth_pro_fast.py | 18 ++++++++++-------- .../models/depth_pro/modeling_depth_pro.py | 10 +++++++--- .../depth_pro/test_modeling_depth_pro.py | 8 ++++---- 5 files changed, 33 insertions(+), 24 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 9a18cfc8735d..a701497caea8 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -58,7 +58,8 @@ The DepthPro model processes an input image by first downsampling it at multiple ... outputs, target_sizes=[(image.height, image.width)], ... ) ->>> fov = post_processed_output[0]["fov"] +>>> field_of_view = post_processed_output[0]["field_of_view"] +>>> focal_length = post_processed_output[0]["focal_length"] >>> depth = post_processed_output[0]["predicted_depth"] >>> depth = (depth - depth.min()) / depth.max() >>> depth = depth * 255. diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 60bea8460cb4..9a9568fdde05 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -337,9 +337,9 @@ def post_process_depth_estimation( target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, ) -> Dict[str, List[TensorType]]: """ - Post-processes the raw depth predictions from the model to generate final depth predictions and optionally - resizes them to specified target sizes. This function supports scaling based on the field of view (FoV) - and adjusts depth values accordingly. + Post-processes the raw depth predictions from the model to generate + final depth predictions which is caliberated using the field of view if provided + and resized to specified target sizes if provided. 
Args: outputs ([`DepthProDepthEstimatorOutput`]): @@ -351,7 +351,7 @@ def post_process_depth_estimation( Returns: `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth - predictions. + predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`. Raises: `ValueError`: @@ -360,7 +360,7 @@ def post_process_depth_estimation( requires_backends(self, "torch") predicted_depth = outputs.predicted_depth - fov = outputs.fov + fov = outputs.field_of_view batch_size = len(predicted_depth) @@ -373,12 +373,13 @@ def post_process_depth_estimation( fov = [None] * batch_size if fov is None else fov target_sizes = [None] * batch_size if target_sizes is None else target_sizes for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes): + focal_length = None if target_size is not None: # scale image w.r.t fov if fov_value is not None: width = target_size[1] - fov_value = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value)) - depth = depth * width / fov_value + focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * width / focal_length # interpolate depth = torch.nn.functional.interpolate( @@ -395,7 +396,8 @@ def post_process_depth_estimation( results.append( { "predicted_depth": depth, - "fov": fov_value, + "field_of_view": fov_value, + "focal_length": focal_length, } ) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 15ac15a90ddb..a56ae831960d 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -317,9 +317,9 @@ def post_process_depth_estimation( target_sizes: Optional[Union[TensorType, List[Tuple[int, int]], None]] = None, ) -> Dict[str, List[TensorType]]: """ - Post-processes the raw depth predictions from the model to generate final depth predictions and optionally - resizes them to specified target sizes. This function supports scaling based on the field of view (FoV) - and adjusts depth values accordingly. + Post-processes the raw depth predictions from the model to generate + final depth predictions which is caliberated using the field of view if provided + and resized to specified target sizes if provided. Args: outputs ([`DepthProDepthEstimatorOutput`]): @@ -331,7 +331,7 @@ def post_process_depth_estimation( Returns: `List[Dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth - predictions. + predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`. 
Raises: `ValueError`: @@ -340,7 +340,7 @@ def post_process_depth_estimation( requires_backends(self, "torch") predicted_depth = outputs.predicted_depth - fov = outputs.fov + fov = outputs.field_of_view batch_size = len(predicted_depth) @@ -353,12 +353,13 @@ def post_process_depth_estimation( fov = [None] * batch_size if fov is None else fov target_sizes = [None] * batch_size if target_sizes is None else target_sizes for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes): + focal_length = None if target_size is not None: # scale image w.r.t fov if fov_value is not None: width = target_size[1] - fov_value = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value)) - depth = depth * width / fov_value + focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value)) + depth = depth * width / focal_length # interpolate depth = torch.nn.functional.interpolate( @@ -375,7 +376,8 @@ def post_process_depth_estimation( results.append( { "predicted_depth": depth, - "fov": fov_value, + "field_of_view": fov_value, + "focal_length": focal_length, } ) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 888e9d6dce39..61694940b6da 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -78,7 +78,7 @@ class DepthProDepthEstimatorOutput(ModelOutput): Classification (or regression if config.num_labels==1) loss. predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`): Predicted depth for each pixel. - fov (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided): + field_of_view (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided): Field of View Scaler. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + @@ -95,7 +95,7 @@ class DepthProDepthEstimatorOutput(ModelOutput): loss: Optional[torch.FloatTensor] = None predicted_depth: torch.FloatTensor = None - fov: Optional[torch.FloatTensor] = None + field_of_view: Optional[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None attentions: Optional[Tuple[torch.FloatTensor, ...]] = None @@ -1153,6 +1153,10 @@ def forward( ... outputs, target_sizes=[(image.height, image.width)], ... 
) + >>> # get the field of view (fov) predictions + >>> field_of_view = post_processed_output[0]["field_of_view"] + >>> focal_length = post_processed_output[0]["focal_length"] + >>> # visualize the prediction >>> predicted_depth = post_processed_output[0]["predicted_depth"] >>> depth = predicted_depth * 255 / predicted_depth.max() @@ -1198,7 +1202,7 @@ def forward( return DepthProDepthEstimatorOutput( loss=loss, predicted_depth=predicted_depth, - fov=fov, + field_of_view=fov, hidden_states=depth_pro_outputs.hidden_states, attentions=depth_pro_outputs.attentions, ) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 2f728ada14df..1e4ceadbd4eb 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -167,8 +167,8 @@ def create_and_check_for_fov(self, config, pixel_values, labels): row_pixel_values = pixel_values[:1] with torch.no_grad(): - model_batched_output_fov = model(batched_pixel_values).fov - model_row_output_fov = model(row_pixel_values).fov + model_batched_output_fov = model(batched_pixel_values).field_of_view + model_row_output_fov = model(row_pixel_values).field_of_view # check if fov is returned self.parent.assertIsNotNone(model_batched_output_fov) @@ -372,10 +372,10 @@ def test_inference_depth_estimation(self): # verify the predicted fov expected_shape = torch.Size((1,)) - self.assertEqual(outputs.fov.shape, expected_shape) + self.assertEqual(outputs.field_of_view.shape, expected_shape) expected_slice = torch.tensor([47.2459]).to(torch_device) - torch.testing.assert_close(outputs.fov, expected_slice, atol=1e-4, rtol=1e-4) + torch.testing.assert_close(outputs.field_of_view, expected_slice, atol=1e-4, rtol=1e-4) def test_post_processing_depth_estimation(self): model_path = "geetu040/DepthPro" From bc63511b770e207b0b03690047e7a2c04cfe54ff Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 22:19:56 +0500 Subject: [PATCH 127/151] update heads --- .../models/depth_pro/modeling_depth_pro.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 61694940b6da..1a7476b5f22f 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1027,10 +1027,8 @@ def __init__(self, config: DepthProConfig): self.fusion_hidden_size = config.fusion_hidden_size self.fov_encoder = DepthProFOVEncoder(config) - self.global_neck = nn.ModuleList([ - nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1), - nn.ReLU(True), - ]) + self.conv = nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1) + self.activation = nn.ReLU(inplace=True) self.head = DepthProFOVHead(config) def forward( @@ -1041,8 +1039,8 @@ def forward( ) -> torch.Tensor: fov_features = self.fov_encoder(pixel_values, head_mask) - for layer in self.global_neck: - global_features = layer(global_features) + global_features = self.conv(global_features) + global_features = self.activation(global_features) fov_features = fov_features + global_features fov_output = self.head(fov_features) @@ -1064,7 +1062,7 @@ def __init__(self, config): self.config = config features = config.fusion_hidden_size - self.head = nn.ModuleList([ + self.layers = nn.ModuleList([ nn.Conv2d(features, features // 2, 
kernel_size=3, stride=1, padding=1), nn.ConvTranspose2d( in_channels=features // 2, out_channels=features // 2, kernel_size=2, stride=2, padding=0, bias=True @@ -1076,7 +1074,7 @@ def __init__(self, config): ]) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - for layer in self.head: + for layer in self.layers: hidden_states = layer(hidden_states) predicted_depth = hidden_states.squeeze(dim=1) From e33a531d2f88460302de13693ab1dbc47a73a9aa Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 22:54:03 +0500 Subject: [PATCH 128/151] fix and update conversion script for heads --- .../depth_pro/convert_depth_pro_weights_to_hf.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index bddc3114ffec..ddb0b9bd9724 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -31,7 +31,7 @@ # fmt: off ORIGINAL_TO_CONVERTED_KEY_MAPPING = { - # encoder and head + # encoder r"encoder.(patch|image)_encoder.cls_token": r"depth_pro.encoder.\1_encoder.model.embeddings.cls_token", r"encoder.(patch|image)_encoder.pos_embed": r"depth_pro.encoder.\1_encoder.model.embeddings.position_embeddings", r"encoder.(patch|image)_encoder.patch_embed.proj.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.embeddings.patch_embeddings.projection.\2", @@ -42,7 +42,6 @@ r"encoder.(patch|image)_encoder.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"depth_pro.encoder.\1_encoder.model.encoder.layer.\2.mlp.fc\3.\4", r"encoder.(patch|image)_encoder.norm.(weight|bias)": r"depth_pro.encoder.\1_encoder.model.layernorm.\2", r"encoder.fuse_lowres.(weight|bias)": r"depth_pro.neck.fuse_image_with_low_res.\1", - r"head.(\d+).(weight|bias)": r"head.head.\1.\2", # fov r"fov.encoder.0.cls_token": r"fov_model.fov_encoder.model.embeddings.cls_token", @@ -54,9 +53,12 @@ r"fov.encoder.0.blocks.(\d+).ls(\d+).gamma": r"fov_model.fov_encoder.model.encoder.layer.\1.layer_scale\2.lambda1", r"fov.encoder.0.blocks.(\d+).mlp.fc(\d+).(weight|bias)": r"fov_model.fov_encoder.model.encoder.layer.\1.mlp.fc\2.\3", r"fov.encoder.0.norm.(weight|bias)": r"fov_model.fov_encoder.model.layernorm.\1", - r"fov.downsample.(\d+).(weight|bias)": r"fov_model.global_neck.\1.\2", + r"fov.downsample.0.(weight|bias)": r"fov_model.conv.\1", r"fov.encoder.1.(weight|bias)": r"fov_model.fov_encoder.neck.\1", - r"fov.head.head.(\d+).(weight|bias)": r"fov_model.head.layers.\1.\2", + r"fov.head.(\d+).(weight|bias)": r"fov_model.head.layers.\1.\2", + + # head + r"head.(\d+).(weight|bias)": r"head.layers.\1.\2", # upsamples r"encoder.upsample_lowres.(weight|bias)": r"depth_pro.neck.feature_upsample.image_block.layers.0.\1", From 8c0e81a975a9d161957b53a9facc2f4b53476107 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 22:58:41 +0500 Subject: [PATCH 129/151] ruff formatting --- .../models/depth_pro/modeling_depth_pro.py | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 1a7476b5f22f..26384139c267 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1027,7 +1027,9 @@ def __init__(self, config: DepthProConfig): self.fusion_hidden_size = 
config.fusion_hidden_size self.fov_encoder = DepthProFOVEncoder(config) - self.conv = nn.Conv2d(self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1) + self.conv = nn.Conv2d( + self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1 + ) self.activation = nn.ReLU(inplace=True) self.head = DepthProFOVHead(config) @@ -1062,16 +1064,23 @@ def __init__(self, config): self.config = config features = config.fusion_hidden_size - self.layers = nn.ModuleList([ - nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), - nn.ConvTranspose2d( - in_channels=features // 2, out_channels=features // 2, kernel_size=2, stride=2, padding=0, bias=True - ), - nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), - nn.ReLU(True), - nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), - nn.ReLU(), - ]) + self.layers = nn.ModuleList( + [ + nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1), + nn.ConvTranspose2d( + in_channels=features // 2, + out_channels=features // 2, + kernel_size=2, + stride=2, + padding=0, + bias=True, + ), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + ] + ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: for layer in self.layers: From 524dda6f7a03a509ca4c66fbcdd255bc3eed158a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Mon, 3 Feb 2025 23:24:24 +0500 Subject: [PATCH 130/151] remove float32 conversion --- .../models/depth_pro/convert_depth_pro_weights_to_hf.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index ddb0b9bd9724..8efc830f924b 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -164,10 +164,6 @@ def write_model( file_path = hf_hub_download(hf_repo_id, "depth_pro.pt") loaded = torch.load(file_path, weights_only=True) - # ensure state_dict is in float32 - for key in loaded.keys(): - loaded[key] = loaded[key].to(torch.float32) - print("Converting model...") all_keys = list(loaded.keys()) new_keys = convert_old_keys_to_new_keys(all_keys) From a87d26a61bc6221311399297010e56f7338f5213 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 4 Feb 2025 21:16:15 +0500 Subject: [PATCH 131/151] use "Fov" instead of "FOV" in class names --- .../models/depth_pro/modeling_depth_pro.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 26384139c267..aa12e995b3a2 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -677,7 +677,7 @@ def forward(self, features: List[torch.Tensor]) -> List[torch.Tensor]: Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. use_fov_model (`bool`, *optional*, defaults to `True`): - Whether to use `DepthProFOVModel` to generate the field of view. + Whether to use `DepthProFovModel` to generate the field of view. 
""" @@ -932,7 +932,7 @@ def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: return fused_hidden_states -class DepthProFOVEncoder(nn.Module): +class DepthProFovEncoder(nn.Module): def __init__(self, config: DepthProConfig): super().__init__() self.config = config @@ -979,7 +979,7 @@ def forward( return features -class DepthProFOVHead(nn.Module): +class DepthProFovHead(nn.Module): def __init__(self, config: DepthProConfig): super().__init__() self.config = config @@ -1020,18 +1020,18 @@ def forward(self, features: torch.Tensor) -> torch.Tensor: return features -class DepthProFOVModel(nn.Module): +class DepthProFovModel(nn.Module): def __init__(self, config: DepthProConfig): super().__init__() self.config = config self.fusion_hidden_size = config.fusion_hidden_size - self.fov_encoder = DepthProFOVEncoder(config) + self.fov_encoder = DepthProFovEncoder(config) self.conv = nn.Conv2d( self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1 ) self.activation = nn.ReLU(inplace=True) - self.head = DepthProFOVHead(config) + self.head = DepthProFovHead(config) def forward( self, @@ -1112,7 +1112,7 @@ def __init__(self, config, use_fov_model=None): self.head = DepthProDepthEstimationHead(config) # dinov2 (vit) like encoder - self.fov_model = DepthProFOVModel(config) if self.use_fov_model else None + self.fov_model = DepthProFovModel(config) if self.use_fov_model else None # Initialize weights and apply final processing self.post_init() From 5fccbff8de4eac6a1a161a3f620b71148d7d3268 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 4 Feb 2025 21:18:45 +0500 Subject: [PATCH 132/151] use "Fov" instead of "FOV" in config docs --- src/transformers/models/depth_pro/configuration_depth_pro.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index ffc8033b55c4..36de741b704a 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -58,9 +58,9 @@ class DepthProConfig(PretrainedConfig): use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`): Whether to use bias in the pre-activate residual units of the fusion blocks. use_fov_model (`bool`, *optional*, defaults to `False`): - Whether to use `DepthProFOVModel` to generate the field of view. + Whether to use `DepthProFovModel` to generate the field of view. num_fov_head_layers (`int`, *optional*, defaults to 2): - Number of convolution layers in the head of `DepthProFOVModel`. + Number of convolution layers in the head of `DepthProFovModel`. image_model_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*): The configuration of the image encoder model, which is loaded using the [`AutoModel`] API. By default, Dinov2 model is used as backbone. 
From 24f1413abd06110de00e5a53579d16bd62707d34 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 4 Feb 2025 21:44:46 +0500 Subject: [PATCH 133/151] remove prune_heads --- src/transformers/models/depth_pro/modeling_depth_pro.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index aa12e995b3a2..173adc8ac2d2 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -728,14 +728,6 @@ def __init__(self, config): def get_input_embeddings(self): return self.encoder.image_encoder.model.get_input_embeddings() - def _prune_heads(self, heads_to_prune): - """ - Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base - class PreTrainedModel - """ - self.encoder.patch_encoder.model._prune_heads(heads_to_prune) - self.encoder.image_encoder.model._prune_heads(heads_to_prune) - @add_start_docstrings_to_model_forward(DEPTH_PRO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) def forward( From a3dab1840d6ef19d416a9bcdeeadf8e9274be44f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 4 Feb 2025 22:10:38 +0500 Subject: [PATCH 134/151] update fusion stage --- .../depth_pro/convert_depth_pro_weights_to_hf.py | 16 +++++++++++----- .../models/depth_pro/modeling_depth_pro.py | 13 +++++++++---- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 8efc830f924b..07aebbe18607 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -75,14 +75,20 @@ ), # fusion stage - r"decoder.fusions.(\d+).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( - f"fusion_stage.layers.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" + r"decoder.fusions.([1234]).resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( + f"fusion_stage.intermediate.{4-int(match.group(1))}.residual_layer{match.group(2)}.convolution{(int(match.group(3))+1)//2}.{match.group(4)}" ), - r"decoder.fusions.(\d+).out_conv.(weight|bias)": lambda match: ( - f"fusion_stage.layers.{4-int(match.group(1))}.projection.{match.group(2)}" + r"decoder.fusions.0.resnet(\d+).residual.(\d+).(weight|bias)": lambda match: ( + f"fusion_stage.final.residual_layer{match.group(1)}.convolution{(int(match.group(2))+1)//2}.{match.group(3)}" + ), + r"decoder.fusions.([1234]).out_conv.(weight|bias)": lambda match: ( + f"fusion_stage.intermediate.{4-int(match.group(1))}.projection.{match.group(2)}" + ), + r"decoder.fusions.0.out_conv.(weight|bias)": lambda match: ( + f"fusion_stage.final.projection.{match.group(1)}" ), r"decoder.fusions.(\d+).deconv.(weight|bias)": lambda match: ( - f"fusion_stage.layers.{4-int(match.group(1))}.deconv.{match.group(2)}" + f"fusion_stage.intermediate.{4-int(match.group(1))}.deconv.{match.group(2)}" ), } # fmt: on diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 173adc8ac2d2..d994d66eeeff 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -898,11 +898,12 @@ def 
__init__(self, config): self.config = config self.num_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios) - self.layers = nn.ModuleList() + self.intermediate = nn.ModuleList() for _ in range(self.num_layers - 1): - self.layers.append(DepthProFeatureFusionLayer(config)) + self.intermediate.append(DepthProFeatureFusionLayer(config)) + # final layer doesnot require deconvolution - self.layers.append(DepthProFeatureFusionLayer(config, use_deconv=False)) + self.final = DepthProFeatureFusionLayer(config, use_deconv=False) def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: if self.num_layers != len(hidden_states): @@ -913,7 +914,7 @@ def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: fused_hidden_states = [] fused_hidden_state = None - for hidden_state, layer in zip(hidden_states, self.layers): + for hidden_state, layer in zip(hidden_states[:-1], self.intermediate): if fused_hidden_state is None: # first layer only uses the last hidden_state fused_hidden_state = layer(hidden_state) @@ -921,6 +922,10 @@ def forward(self, hidden_states: List[torch.Tensor]) -> List[torch.Tensor]: fused_hidden_state = layer(fused_hidden_state, hidden_state) fused_hidden_states.append(fused_hidden_state) + hidden_state = hidden_states[-1] + fused_hidden_state = self.final(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) + return fused_hidden_states From 48eb534934fd7a63c1ad0482a300d45529875360 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 4 Feb 2025 22:26:41 +0500 Subject: [PATCH 135/151] use device in examples --- docs/source/en/model_doc/depth_pro.md | 6 ++++-- src/transformers/models/depth_pro/modeling_depth_pro.py | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index a701497caea8..00ebed799b2e 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -43,13 +43,15 @@ The DepthPro model processes an input image by first downsampling it at multiple >>> import torch >>> from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation +>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) >>> image_processor = DepthProImageProcessorFast.from_pretrained("geetu040/DepthPro") ->>> model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro") +>>> model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro").to(device) ->>> inputs = image_processor(images=image, return_tensors="pt") +>>> inputs = image_processor(images=image, return_tensors="pt").to(device) >>> with torch.no_grad(): ... 
outputs = model(**inputs) diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index d994d66eeeff..e8421ab3bcea 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -1146,8 +1146,11 @@ def forward( >>> processor = AutoImageProcessor.from_pretrained(checkpoint) >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint) + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + >>> model.to(device) + >>> # prepare image for the model - >>> inputs = processor(images=image, return_tensors="pt") + >>> inputs = processor(images=image, return_tensors="pt").to(device) >>> with torch.no_grad(): ... outputs = model(**inputs) From ba37c9167edbd50d38d2e5efd2141472ce17b00c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 12:50:28 +0500 Subject: [PATCH 136/151] update processor --- .../convert_depth_pro_weights_to_hf.py | 12 +- .../depth_pro/image_processing_depth_pro.py | 24 +- .../image_processing_depth_pro_fast.py | 373 ++++++------------ .../test_image_processing_depth_pro.py | 10 +- 4 files changed, 125 insertions(+), 294 deletions(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index 07aebbe18607..feebcd7fd9c0 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -203,17 +203,7 @@ def write_model( def write_image_processor(output_dir: str): - image_processor = DepthProImageProcessorFast( - do_resize=True, - size={"height": 1536, "width": 1536}, - resample=PILImageResampling.BILINEAR, - antialias=False, - do_rescale=True, - rescale_factor=1 / 255, - do_normalize=True, - image_mean=0.5, - image_std=0.5, - ) + image_processor = DepthProImageProcessorFast() image_processor.save_pretrained(output_dir) return image_processor diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 9a9568fdde05..5871e0f764cd 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -67,9 +67,6 @@ class DepthProImageProcessor(BaseImageProcessor): resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` parameter in the `preprocess` method. 
@@ -94,7 +91,6 @@ def __init__( do_resize: bool = True, size: Optional[Dict[str, int]] = None, resample: PILImageResampling = PILImageResampling.BILINEAR, - antialias: bool = False, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, @@ -110,7 +106,6 @@ def __init__( self.do_normalize = do_normalize self.size = size self.resample = resample - self.antialias = antialias self.rescale_factor = rescale_factor self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD @@ -120,7 +115,6 @@ def resize( image: np.ndarray, size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, - antialias: bool = False, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, @@ -135,9 +129,6 @@ def resize( Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. Can be one of: @@ -172,7 +163,6 @@ def resize( input=image_tensor, size=output_size, mode=pil_torch_interpolation_mapping[resample].value, - antialias=antialias, ) resized_image = resized_image.squeeze(0).numpy() return resized_image @@ -182,7 +172,6 @@ def _validate_input_arguments( do_resize: bool, size: Dict[str, int], resample: PILImageResampling, - antialias: bool, do_rescale: bool, rescale_factor: float, do_normalize: bool, @@ -190,8 +179,8 @@ def _validate_input_arguments( image_std: Union[float, List[float]], data_format: Union[str, ChannelDimension], ): - if do_resize and None in (size, resample, antialias): - raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + if do_resize and None in (size, resample): + raise ValueError("Size and resample must be specified if do_resize is True.") if do_rescale and rescale_factor is None: raise ValueError("Rescale factor must be specified if do_rescale is True.") @@ -206,7 +195,6 @@ def preprocess( do_resize: Optional[bool] = None, size: Optional[Dict[str, int]] = None, resample: Optional[PILImageResampling] = None, - antialias: Optional[bool] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, @@ -231,9 +219,6 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. 
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -267,7 +252,6 @@ def preprocess( do_rescale = do_rescale if do_rescale is not None else self.do_rescale do_normalize = do_normalize if do_normalize is not None else self.do_normalize resample = resample if resample is not None else self.resample - antialias = antialias if antialias is not None else self.antialias rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std @@ -285,7 +269,6 @@ def preprocess( do_resize=do_resize, size=size, resample=resample, - antialias=antialias, do_rescale=do_rescale, rescale_factor=rescale_factor, do_normalize=do_normalize, @@ -321,7 +304,7 @@ def preprocess( # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format) - image = self.resize(image=image, size=size, resample=resample, antialias=antialias) + image = self.resize(image=image, size=size, resample=resample) image = to_channel_dimension_format(image, data_format, input_channel_dim=ChannelDimension.FIRST) else: image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) @@ -387,7 +370,6 @@ def post_process_depth_estimation( input=depth.unsqueeze(0).unsqueeze(1), size=target_size, mode=pil_torch_interpolation_mapping[self.resample].value, - antialias=self.antialias, ).squeeze() # inverse the depth diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index a56ae831960d..cc6c3feace82 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -14,26 +14,34 @@ # limitations under the License. 
"""Fast Image processor class for DepthPro.""" -import functools from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union from ...image_processing_base import BatchFeature -from ...image_processing_utils import get_size_dict -from ...image_processing_utils_fast import BaseImageProcessorFast, SizeDict -from ...image_transforms import FusedRescaleNormalize, NumpyToTensor, Rescale +from ...image_processing_utils_fast import ( + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, + BaseImageProcessorFast, + ChannelDimension, + get_image_size_for_max_height_width, + get_resize_output_image_size, + get_size_with_aspect_ratio, + group_images_by_shape, + reorder_images, +) from ...image_utils import ( IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, - ChannelDimension, - ImageInput, - ImageType, PILImageResampling, - get_image_type, - make_list_of_images, - pil_torch_interpolation_mapping, + SizeDict, +) +from ...utils import ( + TensorType, + add_start_docstrings, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available, + logging, + requires_backends, ) -from ...utils import TensorType, logging, requires_backends -from ...utils.import_utils import is_torch_available, is_torchvision_available if TYPE_CHECKING: @@ -47,268 +55,118 @@ if is_torchvision_available(): - from torchvision.transforms import Compose, Normalize, PILToTensor, Resize + from ...image_utils import pil_torch_interpolation_mapping + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F + +@add_start_docstrings( + "Constructs a fast DepthPro image processor.", + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, +) class DepthProImageProcessorFast(BaseImageProcessorFast): - r""" - Constructs a DepthPro image processor. - - Args: - do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the image's (height, width) dimensions to the specified `(size["height"], - size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. - size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`): - Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` - method. - resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): - Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the - `preprocess` method. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. - do_rescale (`bool`, *optional*, defaults to `True`): - Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` - parameter in the `preprocess` method. - rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): - Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the - `preprocess` method. - do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` - method. - image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): - Mean to use if normalizing the image. This is a float or list of floats the length of the number of - channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. 
- image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): - Standard deviation to use if normalizing the image. This is a float or list of floats the length of the - number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. - """ - - model_input_names = ["pixel_values"] - _transform_params = [ - "do_resize", - "do_rescale", - "do_normalize", - "size", - "resample", - "antialias", - "rescale_factor", - "image_mean", - "image_std", - "image_type", - ] - - def __init__( + resample = PILImageResampling.BILINEAR + image_mean = IMAGENET_STANDARD_MEAN + image_std = IMAGENET_STANDARD_STD + size = {"height": 1536, "width": 1536} + do_resize = True + do_rescale = True + do_normalize = True + + # Only difference with BaseImageProcessorFast.resize is that `antialias=False` in F.resize + def resize( self, - do_resize: bool = True, - size: Optional[Dict[str, int]] = None, - resample: PILImageResampling = PILImageResampling.BILINEAR, - antialias: bool = False, - do_rescale: bool = True, - rescale_factor: Union[int, float] = 1 / 255, - do_normalize: bool = True, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, + image: "torch.Tensor", + size: SizeDict, + interpolation: "F.InterpolationMode" = None, **kwargs, - ): - super().__init__(**kwargs) - size = size if size is not None else {"height": 1536, "width": 1536} - size = get_size_dict(size) - self.do_resize = do_resize - self.do_rescale = do_rescale - self.do_normalize = do_normalize - self.size = size - self.resample = resample - self.antialias = antialias - self.rescale_factor = rescale_factor - self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN - self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD - - def _build_transforms( - self, - do_resize: bool, - size: Dict[str, int], - resample: PILImageResampling, - antialias: bool, - do_rescale: bool, - rescale_factor: float, - do_normalize: bool, - image_mean: Union[float, List[float]], - image_std: Union[float, List[float]], - image_type: ImageType, - ) -> "Compose": + ) -> "torch.Tensor": """ - Given the input settings build the image transforms using `torchvision.transforms.Compose`. + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. + + Returns: + `torch.Tensor`: The resized image. 
""" - transforms = [] - - # All PIL and numpy values need to be converted to a torch tensor - # to keep cross compatibility with slow image processors - if image_type == ImageType.PIL: - transforms.append(PILToTensor()) - - elif image_type == ImageType.NUMPY: - transforms.append(NumpyToTensor()) - - # We can combine rescale and normalize into a single operation for speed - if do_rescale and do_normalize: - transforms.append(FusedRescaleNormalize(image_mean, image_std, rescale_factor=rescale_factor)) - elif do_rescale: - transforms.append(Rescale(rescale_factor=rescale_factor)) - elif do_normalize: - transforms.append(Normalize(image_mean, image_std)) - - # depth-pro scales the image before resizing it - if do_resize: - transforms.append( - Resize( - (size["height"], size["width"]), - interpolation=pil_torch_interpolation_mapping[resample], - antialias=antialias, - ) + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + if size.shortest_edge and size.longest_edge: + # Resize the image so that the shortest edge or the longest edge is of the given size + # while maintaining the aspect ratio of the original image. + new_size = get_size_with_aspect_ratio( + image.size()[-2:], + size.shortest_edge, + size.longest_edge, ) + elif size.shortest_edge: + new_size = get_resize_output_image_size( + image, + size=size.shortest_edge, + default_to_square=False, + input_data_format=ChannelDimension.FIRST, + ) + elif size.max_height and size.max_width: + new_size = get_image_size_for_max_height_width(image.size()[-2:], size.max_height, size.max_width) + elif size.height and size.width: + new_size = (size.height, size.width) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" + f" {size}." 
+ ) + return F.resize(image, new_size, interpolation=interpolation, antialias=False) - return Compose(transforms) - - @functools.lru_cache(maxsize=1) - def _validate_input_arguments( + # DepthPro resizes image after rescaling and normalizing, + # which makes it different from BaseImageProcessorFast._preprocess + def _preprocess( self, - return_tensors: Union[str, TensorType], + images: List["torch.Tensor"], do_resize: bool, - size: Dict[str, int], - resample: PILImageResampling, - antialias: bool, + size: SizeDict, + interpolation: Optional["F.InterpolationMode"], + do_center_crop: bool, + crop_size: SizeDict, do_rescale: bool, rescale_factor: float, do_normalize: bool, - image_mean: Union[float, List[float]], - image_std: Union[float, List[float]], - data_format: Union[str, ChannelDimension], - image_type: ImageType, - ): - if return_tensors != "pt": - raise ValueError("Only returning PyTorch tensors is currently supported.") - - if data_format != ChannelDimension.FIRST: - raise ValueError("Only channel first data format is currently supported.") - - if do_resize and None in (size, resample, antialias): - raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + image_mean: Optional[Union[float, List[float]]], + image_std: Optional[Union[float, List[float]]], + return_tensors: Optional[Union[str, TensorType]], + ) -> BatchFeature: + # Group images by size for batched scaling + grouped_images, grouped_images_index = group_images_by_shape(images) + scaled_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + scaled_images_grouped[shape] = stacked_images + scaled_images = reorder_images(scaled_images_grouped, grouped_images_index) - if do_rescale and rescale_factor is None: - raise ValueError("Rescale factor must be specified if do_rescale is True.") + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(scaled_images) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) - if do_normalize and None in (image_mean, image_std): - raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.") + processed_images = torch.stack(resized_images, dim=0) if return_tensors else resized_images - def preprocess( - self, - images: ImageInput, - do_resize: Optional[bool] = None, - size: Optional[Dict[str, int]] = None, - resample: Optional[PILImageResampling] = None, - antialias: Optional[bool] = None, - do_rescale: Optional[bool] = None, - rescale_factor: Optional[float] = None, - do_normalize: Optional[bool] = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - return_tensors: Optional[Union[str, TensorType]] = "pt", - data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, - input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, - ): - """ - Preprocess an image or batch of images. - - Args: - images (`ImageInput`): - Image to preprocess. 
Expects a single or batch of images with pixel values ranging from 0 to 255. If - passing in images with pixel values between 0 and 1, set `do_rescale=False`. - do_resize (`bool`, *optional*, defaults to `self.do_resize`): - Whether to resize the image. - size (`Dict[str, int]`, *optional*, defaults to `self.size`): - Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after - resizing. - resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): - `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has - an effect if `do_resize` is set to `True`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. - do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): - Whether to rescale the image values between [0 - 1]. - rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): - Rescale factor to rescale the image by if `do_rescale` is set to `True`. - do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): - Whether to normalize the image. - image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): - Image mean to use if `do_normalize` is set to `True`. - image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): - Image standard deviation to use if `do_normalize` is set to `True`. - return_tensors (`str` or `TensorType`, *optional*): - The type of tensors to return. Only "pt" is supported - data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): - The channel dimension format for the output image. The following formats are currently supported: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - input_data_format (`ChannelDimension` or `str`, *optional*): - The channel dimension format for the input image. If unset, the channel dimension format is inferred - from the input image. Can be one of: - - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
- """ - do_resize = do_resize if do_resize is not None else self.do_resize - do_rescale = do_rescale if do_rescale is not None else self.do_rescale - do_normalize = do_normalize if do_normalize is not None else self.do_normalize - resample = resample if resample is not None else self.resample - antialias = antialias if antialias is not None else self.antialias - rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor - image_mean = image_mean if image_mean is not None else self.image_mean - image_std = image_std if image_std is not None else self.image_std - size = size if size is not None else self.size - # Make hashable for cache - size = SizeDict(**size) - image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean - image_std = tuple(image_std) if isinstance(image_std, list) else image_std - - images = make_list_of_images(images) - image_type = get_image_type(images[0]) - - if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]: - raise ValueError(f"Unsupported input image type {image_type}") - - self._validate_input_arguments( - do_resize=do_resize, - size=size, - resample=resample, - antialias=antialias, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - return_tensors=return_tensors, - data_format=data_format, - image_type=image_type, - ) - - transforms = self.get_transforms( - do_resize=do_resize, - do_rescale=do_rescale, - do_normalize=do_normalize, - size=size, - resample=resample, - antialias=antialias, - rescale_factor=rescale_factor, - image_mean=image_mean, - image_std=image_std, - image_type=image_type, - ) - transformed_images = [transforms(image) for image in images] - - data = {"pixel_values": torch.stack(transformed_images, dim=0)} - return BatchFeature(data, tensor_type=return_tensors) + return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) # Copied from transformers.models.depth_pro.image_processing_depth_pro.DepthProImageProcessor.post_process_depth_estimation def post_process_depth_estimation( @@ -367,7 +225,6 @@ def post_process_depth_estimation( input=depth.unsqueeze(0).unsqueeze(1), size=target_size, mode=pil_torch_interpolation_mapping[self.resample].value, - antialias=self.antialias, ).squeeze() # inverse the depth diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index 0e830698c0a1..e754e53e009f 100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -16,14 +16,17 @@ import unittest -from transformers.file_utils import is_vision_available from transformers.testing_utils import is_flaky, require_torch, require_vision +from transformers.utils import is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs if is_vision_available(): - from transformers import DepthProImageProcessor, DepthProImageProcessorFast + from transformers import DepthProImageProcessor + + if is_torchvision_available(): + from transformers import DepthProImageProcessorFast class DepthProImageProcessingTester(unittest.TestCase): @@ -83,7 +86,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F @require_vision class DepthProImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): image_processing_class = DepthProImageProcessor if 
is_vision_available() else None - fast_image_processing_class = DepthProImageProcessorFast if is_vision_available() else None + fast_image_processing_class = DepthProImageProcessorFast if is_torchvision_available() else None def setUp(self): super().setUp() @@ -103,7 +106,6 @@ def test_image_processor_properties(self): self.assertTrue(hasattr(image_processing, "do_rescale")) self.assertTrue(hasattr(image_processing, "rescale_factor")) self.assertTrue(hasattr(image_processing, "resample")) - self.assertTrue(hasattr(image_processing, "antialias")) def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict) From 949ecb969a8f2459bc2975856ff808240f95ff98 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 12:58:10 +0500 Subject: [PATCH 137/151] ruff fixes --- .../models/depth_pro/convert_depth_pro_weights_to_hf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index feebcd7fd9c0..f2cfc0bdd758 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -25,7 +25,6 @@ DepthProForDepthEstimation, DepthProImageProcessorFast, ) -from transformers.image_utils import PILImageResampling # fmt: off From 0e2861d1af01fbc33ac1d2101c2b03a01f42bfab Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 13:34:50 +0500 Subject: [PATCH 138/151] add do_rescale in image_processor_dict --- tests/models/depth_pro/test_image_processing_depth_pro.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index e754e53e009f..b30931a86cdb 100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -40,6 +40,7 @@ def __init__( max_resolution=400, do_resize=True, size=None, + do_rescale=True, do_normalize=True, image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5], @@ -54,6 +55,7 @@ def __init__( self.max_resolution = max_resolution self.do_resize = do_resize self.size = size + self.do_rescale = do_rescale self.do_normalize = do_normalize self.image_mean = image_mean self.image_std = image_std @@ -62,6 +64,7 @@ def prepare_image_processor_dict(self): return { "image_mean": self.image_mean, "image_std": self.image_std, + "do_rescale": self.do_rescale, "do_normalize": self.do_normalize, "do_resize": self.do_resize, "size": self.size, From a6efedb8d80245496cefb89c77ce0a9732888a9c Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 13:36:09 +0500 Subject: [PATCH 139/151] skip test: test_fast_is_faster_than_slow --- tests/models/depth_pro/test_image_processing_depth_pro.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index b30931a86cdb..13f329018acd 100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -117,8 +117,8 @@ def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - @is_flaky( - description="fast and slow, 
both processors use torch implementation, see: https://github.com/huggingface/transformers/issues/34920", + @unittest.skip( + reason="both processors (fast and slow) use torch for resizing, check: https://github.com/huggingface/transformers/issues/34920", ) def test_fast_is_faster_than_slow(self): - super().test_fast_is_faster_than_slow() + pass From 4d8f927a86ce6ad4fcde12daeb88ec7052d2749a Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 13:38:15 +0500 Subject: [PATCH 140/151] ruff formatting --- tests/models/depth_pro/test_image_processing_depth_pro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index 13f329018acd..434741b13e1b 100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -16,7 +16,7 @@ import unittest -from transformers.testing_utils import is_flaky, require_torch, require_vision +from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs From dd8de27023dbe0d4a294fe598366a60cbbd3449f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 14:03:54 +0500 Subject: [PATCH 141/151] DepthProImageProcessorFast in other files --- src/transformers/__init__.py | 2 ++ src/transformers/utils/dummy_torchvision_objects.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index a0144429364b..d9db0a0fd6e3 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1315,6 +1315,7 @@ _import_structure["models.convnext"].append("ConvNextImageProcessorFast") _import_structure["models.deformable_detr"].append("DeformableDetrImageProcessorFast") _import_structure["models.deit"].append("DeiTImageProcessorFast") + _import_structure["models.depth_pro"].append("DepthProImageProcessorFast") _import_structure["models.detr"].append("DetrImageProcessorFast") _import_structure["models.llava"].append("LlavaImageProcessorFast") _import_structure["models.llava_next"].append("LlavaNextImageProcessorFast") @@ -6466,6 +6467,7 @@ from .models.convnext import ConvNextImageProcessorFast from .models.deformable_detr import DeformableDetrImageProcessorFast from .models.deit import DeiTImageProcessorFast + from .models.depth_pro import DepthProImageProcessorFast from .models.detr import DetrImageProcessorFast from .models.llava import LlavaImageProcessorFast from .models.llava_next import LlavaNextImageProcessorFast diff --git a/src/transformers/utils/dummy_torchvision_objects.py b/src/transformers/utils/dummy_torchvision_objects.py index f1b75efc2071..87b60fbc0463 100644 --- a/src/transformers/utils/dummy_torchvision_objects.py +++ b/src/transformers/utils/dummy_torchvision_objects.py @@ -44,6 +44,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torchvision"]) +class DepthProImageProcessorFast(metaclass=DummyObject): + _backends = ["torchvision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torchvision"]) + + class DetrImageProcessorFast(metaclass=DummyObject): _backends = ["torchvision"] From 5caa0bd8f9f7463b98410c04e6cfe8fef3adee18 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 18:07:45 +0500 Subject: [PATCH 142/151] revert 
antialias removal --- .../depth_pro/image_processing_depth_pro.py | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 5871e0f764cd..9a9568fdde05 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -67,6 +67,9 @@ class DepthProImageProcessor(BaseImageProcessor): resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` parameter in the `preprocess` method. @@ -91,6 +94,7 @@ def __init__( do_resize: bool = True, size: Optional[Dict[str, int]] = None, resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, @@ -106,6 +110,7 @@ def __init__( self.do_normalize = do_normalize self.size = size self.resample = resample + self.antialias = antialias self.rescale_factor = rescale_factor self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD @@ -115,6 +120,7 @@ def resize( image: np.ndarray, size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, + antialias: bool = False, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, @@ -129,6 +135,9 @@ def resize( Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. 
Can be one of: @@ -163,6 +172,7 @@ def resize( input=image_tensor, size=output_size, mode=pil_torch_interpolation_mapping[resample].value, + antialias=antialias, ) resized_image = resized_image.squeeze(0).numpy() return resized_image @@ -172,6 +182,7 @@ def _validate_input_arguments( do_resize: bool, size: Dict[str, int], resample: PILImageResampling, + antialias: bool, do_rescale: bool, rescale_factor: float, do_normalize: bool, @@ -179,8 +190,8 @@ def _validate_input_arguments( image_std: Union[float, List[float]], data_format: Union[str, ChannelDimension], ): - if do_resize and None in (size, resample): - raise ValueError("Size and resample must be specified if do_resize is True.") + if do_resize and None in (size, resample, antialias): + raise ValueError("Size, resample and antialias must be specified if do_resize is True.") if do_rescale and rescale_factor is None: raise ValueError("Rescale factor must be specified if do_rescale is True.") @@ -195,6 +206,7 @@ def preprocess( do_resize: Optional[bool] = None, size: Optional[Dict[str, int]] = None, resample: Optional[PILImageResampling] = None, + antialias: Optional[bool] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, @@ -219,6 +231,9 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. + antialias (`bool`, *optional*, defaults to `False`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. 
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -252,6 +267,7 @@ def preprocess( do_rescale = do_rescale if do_rescale is not None else self.do_rescale do_normalize = do_normalize if do_normalize is not None else self.do_normalize resample = resample if resample is not None else self.resample + antialias = antialias if antialias is not None else self.antialias rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std @@ -269,6 +285,7 @@ def preprocess( do_resize=do_resize, size=size, resample=resample, + antialias=antialias, do_rescale=do_rescale, rescale_factor=rescale_factor, do_normalize=do_normalize, @@ -304,7 +321,7 @@ def preprocess( # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format) - image = self.resize(image=image, size=size, resample=resample) + image = self.resize(image=image, size=size, resample=resample, antialias=antialias) image = to_channel_dimension_format(image, data_format, input_channel_dim=ChannelDimension.FIRST) else: image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) @@ -370,6 +387,7 @@ def post_process_depth_estimation( input=depth.unsqueeze(0).unsqueeze(1), size=target_size, mode=pil_torch_interpolation_mapping[self.resample].value, + antialias=self.antialias, ).squeeze() # inverse the depth From 3ae1134780ae236872985523d9c0a444eabcc179 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 18:45:35 +0500 Subject: [PATCH 143/151] add antialias in BaseImageProcessorFast --- .../image_processing_utils_fast.py | 25 +++++++- .../image_processing_depth_pro_fast.py | 62 +++---------------- 2 files changed, 31 insertions(+), 56 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index cb7d1c46aa79..20dc06e53b3b 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -132,6 +132,7 @@ class DefaultFastImageProcessorInitKwargs(TypedDict, total=False): size: Optional[Dict[str, int]] default_to_square: Optional[bool] resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] + antialias: Optional[bool] do_center_crop: Optional[bool] crop_size: Optional[Dict[str, int]] do_rescale: Optional[bool] @@ -163,6 +164,9 @@ class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwa resample (`PILImageResampling`, *optional*, defaults to `self.resample`): Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be overridden by the `resample` parameter in the `preprocess` method. + antialias (`bool`, *optional*, defaults to `True`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the `preprocess` method. 
@@ -203,6 +207,9 @@ class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwa resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to `self.resample`): Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only has an effect if `do_resize` is set to `True`. + antialias (`bool`, *optional*, defaults to `True`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): Whether to center crop the image. crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): @@ -243,6 +250,7 @@ class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwa ) class BaseImageProcessorFast(BaseImageProcessor): resample = None + antialias = None image_mean = None image_std = None size = None @@ -283,6 +291,7 @@ def resize( image: "torch.Tensor", size: SizeDict, interpolation: "F.InterpolationMode" = None, + antialias: bool = True, **kwargs, ) -> "torch.Tensor": """ @@ -295,11 +304,15 @@ def resize( Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. + antialias (`bool`, *optional*, defaults to `True`): + Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with + bilinear or bicubic modes and it is ignored otherwise. Returns: `torch.Tensor`: The resized image. """ interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + antialias = antialias if antialias is not None else True if size.shortest_edge and size.longest_edge: # Resize the image so that the shortest edge or the longest edge is of the given size # while maintaining the aspect ratio of the original image. @@ -324,7 +337,7 @@ def resize( "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" f" {size}." 
) - return F.resize(image, new_size, interpolation=interpolation) + return F.resize(image, new_size, interpolation=interpolation, antialias=antialias) def rescale( self, @@ -578,6 +591,7 @@ def preprocess( image_std = kwargs.pop("image_std") data_format = kwargs.pop("data_format") resample = kwargs.pop("resample") + antialias = kwargs.pop("antialias") # Make hashable for cache size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square)) if size is not None else None @@ -606,6 +620,7 @@ def preprocess( size=size, crop_size=crop_size, interpolation=interpolation, + antialias=antialias, image_mean=image_mean, image_std=image_std, **kwargs, @@ -617,6 +632,7 @@ def _preprocess( do_resize: bool, size: SizeDict, interpolation: Optional["F.InterpolationMode"], + antialias: Optional[bool], do_center_crop: bool, crop_size: SizeDict, do_rescale: bool, @@ -631,7 +647,12 @@ def _preprocess( resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: - stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + stacked_images = self.resize( + image=stacked_images, + size=size, + interpolation=interpolation, + antialias=antialias, + ) resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index cc6c3feace82..2b9870c6dc0a 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -20,10 +20,6 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast, - ChannelDimension, - get_image_size_for_max_height_width, - get_resize_output_image_size, - get_size_with_aspect_ratio, group_images_by_shape, reorder_images, ) @@ -69,6 +65,7 @@ ) class DepthProImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR + antialias = False image_mean = IMAGENET_STANDARD_MEAN image_std = IMAGENET_STANDARD_STD size = {"height": 1536, "width": 1536} @@ -76,55 +73,6 @@ class DepthProImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True - # Only difference with BaseImageProcessorFast.resize is that `antialias=False` in F.resize - def resize( - self, - image: "torch.Tensor", - size: SizeDict, - interpolation: "F.InterpolationMode" = None, - **kwargs, - ) -> "torch.Tensor": - """ - Resize an image to `(size["height"], size["width"])`. - - Args: - image (`torch.Tensor`): - Image to resize. - size (`SizeDict`): - Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. - resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): - `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. - - Returns: - `torch.Tensor`: The resized image. - """ - interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR - if size.shortest_edge and size.longest_edge: - # Resize the image so that the shortest edge or the longest edge is of the given size - # while maintaining the aspect ratio of the original image. 
- new_size = get_size_with_aspect_ratio( - image.size()[-2:], - size.shortest_edge, - size.longest_edge, - ) - elif size.shortest_edge: - new_size = get_resize_output_image_size( - image, - size=size.shortest_edge, - default_to_square=False, - input_data_format=ChannelDimension.FIRST, - ) - elif size.max_height and size.max_width: - new_size = get_image_size_for_max_height_width(image.size()[-2:], size.max_height, size.max_width) - elif size.height and size.width: - new_size = (size.height, size.width) - else: - raise ValueError( - "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" - f" {size}." - ) - return F.resize(image, new_size, interpolation=interpolation, antialias=False) - # DepthPro resizes image after rescaling and normalizing, # which makes it different from BaseImageProcessorFast._preprocess def _preprocess( @@ -133,6 +81,7 @@ def _preprocess( do_resize: bool, size: SizeDict, interpolation: Optional["F.InterpolationMode"], + antialias: bool, do_center_crop: bool, crop_size: SizeDict, do_rescale: bool, @@ -160,7 +109,12 @@ def _preprocess( resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: - stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) + stacked_images = self.resize( + image=stacked_images, + size=size, + interpolation=interpolation, + antialias=antialias, + ) resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) From 8372ad9d7437ddcac07a4d0578230e7974c154a2 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 19:40:29 +0500 Subject: [PATCH 144/151] Revert "revert antialias removal" This reverts commit 5caa0bd8f9f7463b98410c04e6cfe8fef3adee18. --- .../depth_pro/image_processing_depth_pro.py | 24 +++---------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro.py b/src/transformers/models/depth_pro/image_processing_depth_pro.py index 9a9568fdde05..5871e0f764cd 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro.py @@ -67,9 +67,6 @@ class DepthProImageProcessor(BaseImageProcessor): resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the `preprocess` method. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `True`): Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` parameter in the `preprocess` method. 
@@ -94,7 +91,6 @@ def __init__( do_resize: bool = True, size: Optional[Dict[str, int]] = None, resample: PILImageResampling = PILImageResampling.BILINEAR, - antialias: bool = False, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, @@ -110,7 +106,6 @@ def __init__( self.do_normalize = do_normalize self.size = size self.resample = resample - self.antialias = antialias self.rescale_factor = rescale_factor self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD @@ -120,7 +115,6 @@ def resize( image: np.ndarray, size: Dict[str, int], resample: PILImageResampling = PILImageResampling.BILINEAR, - antialias: bool = False, data_format: Optional[Union[str, ChannelDimension]] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs, @@ -135,9 +129,6 @@ def resize( Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. data_format (`ChannelDimension` or `str`, *optional*): The channel dimension format for the output image. If unset, the channel dimension format of the input image is used. Can be one of: @@ -172,7 +163,6 @@ def resize( input=image_tensor, size=output_size, mode=pil_torch_interpolation_mapping[resample].value, - antialias=antialias, ) resized_image = resized_image.squeeze(0).numpy() return resized_image @@ -182,7 +172,6 @@ def _validate_input_arguments( do_resize: bool, size: Dict[str, int], resample: PILImageResampling, - antialias: bool, do_rescale: bool, rescale_factor: float, do_normalize: bool, @@ -190,8 +179,8 @@ def _validate_input_arguments( image_std: Union[float, List[float]], data_format: Union[str, ChannelDimension], ): - if do_resize and None in (size, resample, antialias): - raise ValueError("Size, resample and antialias must be specified if do_resize is True.") + if do_resize and None in (size, resample): + raise ValueError("Size and resample must be specified if do_resize is True.") if do_rescale and rescale_factor is None: raise ValueError("Rescale factor must be specified if do_rescale is True.") @@ -206,7 +195,6 @@ def preprocess( do_resize: Optional[bool] = None, size: Optional[Dict[str, int]] = None, resample: Optional[PILImageResampling] = None, - antialias: Optional[bool] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, @@ -231,9 +219,6 @@ def preprocess( resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has an effect if `do_resize` is set to `True`. - antialias (`bool`, *optional*, defaults to `False`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): Whether to rescale the image values between [0 - 1]. 
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): @@ -267,7 +252,6 @@ def preprocess( do_rescale = do_rescale if do_rescale is not None else self.do_rescale do_normalize = do_normalize if do_normalize is not None else self.do_normalize resample = resample if resample is not None else self.resample - antialias = antialias if antialias is not None else self.antialias rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std @@ -285,7 +269,6 @@ def preprocess( do_resize=do_resize, size=size, resample=resample, - antialias=antialias, do_rescale=do_rescale, rescale_factor=rescale_factor, do_normalize=do_normalize, @@ -321,7 +304,7 @@ def preprocess( # uses torch interpolation which requires ChannelDimension.FIRST if do_resize: image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format) - image = self.resize(image=image, size=size, resample=resample, antialias=antialias) + image = self.resize(image=image, size=size, resample=resample) image = to_channel_dimension_format(image, data_format, input_channel_dim=ChannelDimension.FIRST) else: image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) @@ -387,7 +370,6 @@ def post_process_depth_estimation( input=depth.unsqueeze(0).unsqueeze(1), size=target_size, mode=pil_torch_interpolation_mapping[self.resample].value, - antialias=self.antialias, ).squeeze() # inverse the depth From 666f3b73616ed5c0cd16f42360e4e2018e524a1f Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 19:40:40 +0500 Subject: [PATCH 145/151] Revert "add antialias in BaseImageProcessorFast" This reverts commit 3ae1134780ae236872985523d9c0a444eabcc179. --- .../image_processing_utils_fast.py | 25 +------- .../image_processing_depth_pro_fast.py | 62 ++++++++++++++++--- 2 files changed, 56 insertions(+), 31 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index 20dc06e53b3b..cb7d1c46aa79 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -132,7 +132,6 @@ class DefaultFastImageProcessorInitKwargs(TypedDict, total=False): size: Optional[Dict[str, int]] default_to_square: Optional[bool] resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] - antialias: Optional[bool] do_center_crop: Optional[bool] crop_size: Optional[Dict[str, int]] do_rescale: Optional[bool] @@ -164,9 +163,6 @@ class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwa resample (`PILImageResampling`, *optional*, defaults to `self.resample`): Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be overridden by the `resample` parameter in the `preprocess` method. - antialias (`bool`, *optional*, defaults to `True`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the `preprocess` method. 
@@ -207,9 +203,6 @@ class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwa resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to `self.resample`): Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only has an effect if `do_resize` is set to `True`. - antialias (`bool`, *optional*, defaults to `True`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): Whether to center crop the image. crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): @@ -250,7 +243,6 @@ class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwa ) class BaseImageProcessorFast(BaseImageProcessor): resample = None - antialias = None image_mean = None image_std = None size = None @@ -291,7 +283,6 @@ def resize( image: "torch.Tensor", size: SizeDict, interpolation: "F.InterpolationMode" = None, - antialias: bool = True, **kwargs, ) -> "torch.Tensor": """ @@ -304,15 +295,11 @@ def resize( Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. - antialias (`bool`, *optional*, defaults to `True`): - Whether to apply an anti-aliasing filter when resizing the image. It only affects tensors with - bilinear or bicubic modes and it is ignored otherwise. Returns: `torch.Tensor`: The resized image. """ interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR - antialias = antialias if antialias is not None else True if size.shortest_edge and size.longest_edge: # Resize the image so that the shortest edge or the longest edge is of the given size # while maintaining the aspect ratio of the original image. @@ -337,7 +324,7 @@ def resize( "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" f" {size}." 
) - return F.resize(image, new_size, interpolation=interpolation, antialias=antialias) + return F.resize(image, new_size, interpolation=interpolation) def rescale( self, @@ -591,7 +578,6 @@ def preprocess( image_std = kwargs.pop("image_std") data_format = kwargs.pop("data_format") resample = kwargs.pop("resample") - antialias = kwargs.pop("antialias") # Make hashable for cache size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square)) if size is not None else None @@ -620,7 +606,6 @@ def preprocess( size=size, crop_size=crop_size, interpolation=interpolation, - antialias=antialias, image_mean=image_mean, image_std=image_std, **kwargs, @@ -632,7 +617,6 @@ def _preprocess( do_resize: bool, size: SizeDict, interpolation: Optional["F.InterpolationMode"], - antialias: Optional[bool], do_center_crop: bool, crop_size: SizeDict, do_rescale: bool, @@ -647,12 +631,7 @@ def _preprocess( resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: - stacked_images = self.resize( - image=stacked_images, - size=size, - interpolation=interpolation, - antialias=antialias, - ) + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index 2b9870c6dc0a..cc6c3feace82 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -20,6 +20,10 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast, + ChannelDimension, + get_image_size_for_max_height_width, + get_resize_output_image_size, + get_size_with_aspect_ratio, group_images_by_shape, reorder_images, ) @@ -65,7 +69,6 @@ ) class DepthProImageProcessorFast(BaseImageProcessorFast): resample = PILImageResampling.BILINEAR - antialias = False image_mean = IMAGENET_STANDARD_MEAN image_std = IMAGENET_STANDARD_STD size = {"height": 1536, "width": 1536} @@ -73,6 +76,55 @@ class DepthProImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True + # Only difference with BaseImageProcessorFast.resize is that `antialias=False` in F.resize + def resize( + self, + image: "torch.Tensor", + size: SizeDict, + interpolation: "F.InterpolationMode" = None, + **kwargs, + ) -> "torch.Tensor": + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. + + Returns: + `torch.Tensor`: The resized image. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + if size.shortest_edge and size.longest_edge: + # Resize the image so that the shortest edge or the longest edge is of the given size + # while maintaining the aspect ratio of the original image. 
+ new_size = get_size_with_aspect_ratio( + image.size()[-2:], + size.shortest_edge, + size.longest_edge, + ) + elif size.shortest_edge: + new_size = get_resize_output_image_size( + image, + size=size.shortest_edge, + default_to_square=False, + input_data_format=ChannelDimension.FIRST, + ) + elif size.max_height and size.max_width: + new_size = get_image_size_for_max_height_width(image.size()[-2:], size.max_height, size.max_width) + elif size.height and size.width: + new_size = (size.height, size.width) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" + f" {size}." + ) + return F.resize(image, new_size, interpolation=interpolation, antialias=False) + # DepthPro resizes image after rescaling and normalizing, # which makes it different from BaseImageProcessorFast._preprocess def _preprocess( @@ -81,7 +133,6 @@ def _preprocess( do_resize: bool, size: SizeDict, interpolation: Optional["F.InterpolationMode"], - antialias: bool, do_center_crop: bool, crop_size: SizeDict, do_rescale: bool, @@ -109,12 +160,7 @@ def _preprocess( resized_images_grouped = {} for shape, stacked_images in grouped_images.items(): if do_resize: - stacked_images = self.resize( - image=stacked_images, - size=size, - interpolation=interpolation, - antialias=antialias, - ) + stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) resized_images_grouped[shape] = stacked_images resized_images = reorder_images(resized_images_grouped, grouped_images_index) From 41180e37e17035eae3c1fe3da6e8bfbe583713d1 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 19:55:44 +0500 Subject: [PATCH 146/151] update processor for grouping and antialias --- .../image_processing_utils_fast.py | 3 +- .../image_processing_depth_pro_fast.py | 79 +++---------------- 2 files changed, 13 insertions(+), 69 deletions(-) diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py index cb7d1c46aa79..d21d35212144 100644 --- a/src/transformers/image_processing_utils_fast.py +++ b/src/transformers/image_processing_utils_fast.py @@ -283,6 +283,7 @@ def resize( image: "torch.Tensor", size: SizeDict, interpolation: "F.InterpolationMode" = None, + antialias: bool = True, **kwargs, ) -> "torch.Tensor": """ @@ -324,7 +325,7 @@ def resize( "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" f" {size}." 
) - return F.resize(image, new_size, interpolation=interpolation) + return F.resize(image, new_size, interpolation=interpolation, antialias=antialias) def rescale( self, diff --git a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py index cc6c3feace82..43a23bf10b5e 100644 --- a/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py +++ b/src/transformers/models/depth_pro/image_processing_depth_pro_fast.py @@ -20,10 +20,6 @@ from ...image_processing_utils_fast import ( BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast, - ChannelDimension, - get_image_size_for_max_height_width, - get_resize_output_image_size, - get_size_with_aspect_ratio, group_images_by_shape, reorder_images, ) @@ -76,55 +72,6 @@ class DepthProImageProcessorFast(BaseImageProcessorFast): do_rescale = True do_normalize = True - # Only difference with BaseImageProcessorFast.resize is that `antialias=False` in F.resize - def resize( - self, - image: "torch.Tensor", - size: SizeDict, - interpolation: "F.InterpolationMode" = None, - **kwargs, - ) -> "torch.Tensor": - """ - Resize an image to `(size["height"], size["width"])`. - - Args: - image (`torch.Tensor`): - Image to resize. - size (`SizeDict`): - Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. - resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): - `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. - - Returns: - `torch.Tensor`: The resized image. - """ - interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR - if size.shortest_edge and size.longest_edge: - # Resize the image so that the shortest edge or the longest edge is of the given size - # while maintaining the aspect ratio of the original image. - new_size = get_size_with_aspect_ratio( - image.size()[-2:], - size.shortest_edge, - size.longest_edge, - ) - elif size.shortest_edge: - new_size = get_resize_output_image_size( - image, - size=size.shortest_edge, - default_to_square=False, - input_data_format=ChannelDimension.FIRST, - ) - elif size.max_height and size.max_width: - new_size = get_image_size_for_max_height_width(image.size()[-2:], size.max_height, size.max_width) - elif size.height and size.width: - new_size = (size.height, size.width) - else: - raise ValueError( - "Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got" - f" {size}." 
- ) - return F.resize(image, new_size, interpolation=interpolation, antialias=False) - # DepthPro resizes image after rescaling and normalizing, # which makes it different from BaseImageProcessorFast._preprocess def _preprocess( @@ -144,27 +91,23 @@ def _preprocess( ) -> BatchFeature: # Group images by size for batched scaling grouped_images, grouped_images_index = group_images_by_shape(images) - scaled_images_grouped = {} + processed_images_grouped = {} for shape, stacked_images in grouped_images.items(): - if do_center_crop: - stacked_images = self.center_crop(stacked_images, crop_size) # Fused rescale and normalize stacked_images = self.rescale_and_normalize( stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std ) - scaled_images_grouped[shape] = stacked_images - scaled_images = reorder_images(scaled_images_grouped, grouped_images_index) - - # Group images by size for batched resizing - grouped_images, grouped_images_index = group_images_by_shape(scaled_images) - resized_images_grouped = {} - for shape, stacked_images in grouped_images.items(): if do_resize: - stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation) - resized_images_grouped[shape] = stacked_images - resized_images = reorder_images(resized_images_grouped, grouped_images_index) - - processed_images = torch.stack(resized_images, dim=0) if return_tensors else resized_images + stacked_images = self.resize( + image=stacked_images, + size=size, + interpolation=interpolation, + antialias=False, + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors) From 1265b12d2258af67a512cdba7651b45cac8c17f5 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 20:01:50 +0500 Subject: [PATCH 147/151] try test_fast_is_faster_than_slow without "skip" or "flanky" --- tests/models/depth_pro/test_image_processing_depth_pro.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py index 434741b13e1b..5827512478d1 100644 --- a/tests/models/depth_pro/test_image_processing_depth_pro.py +++ b/tests/models/depth_pro/test_image_processing_depth_pro.py @@ -116,9 +116,3 @@ def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) - - @unittest.skip( - reason="both processors (fast and slow) use torch for resizing, check: https://github.com/huggingface/transformers/issues/34920", - ) - def test_fast_is_faster_than_slow(self): - pass From 4dc850fcb77be76dd7f2dc0fb3911beadaa8f751 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Wed, 5 Feb 2025 23:28:29 +0500 Subject: [PATCH 148/151] update checkpoint --- docs/source/en/model_doc/depth_pro.md | 10 +++++----- .../depth_pro/convert_depth_pro_weights_to_hf.py | 2 +- .../models/depth_pro/modeling_depth_pro.py | 4 ++-- tests/models/depth_pro/test_modeling_depth_pro.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 00ebed799b2e..e84f5a41a355 100644 --- 
a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -48,8 +48,8 @@ The DepthPro model processes an input image by first downsampling it at multiple >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) ->>> image_processor = DepthProImageProcessorFast.from_pretrained("geetu040/DepthPro") ->>> model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro").to(device) +>>> image_processor = DepthProImageProcessorFast.from_pretrained("geetu040/depth-pro-hf") +>>> model = DepthProForDepthEstimation.from_pretrained("geetu040/depth-pro-hf").to(device) >>> inputs = image_processor(images=image, return_tensors="pt").to(device) @@ -96,10 +96,10 @@ The network is supplemented with a focal length estimation head. A small convolu The `use_fov_model` parameter in `DepthProConfig` controls whether **FOV prediction** is enabled. By default, it is set to `False` to conserve memory and computation. When enabled, the **FOV encoder** is instantiated based on the `fov_model_config` parameter, which defaults to a `Dinov2Model`. The `use_fov_model` parameter can also be passed when initializing the `DepthProForDepthEstimation` model. -The pretrained model at checkpoint `geetu040/DepthPro` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. +The pretrained model at checkpoint `geetu040/depth-pro-hf` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. ```py >>> from transformers import DepthProForDepthEstimation ->>> model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro", use_fov_model=False) +>>> model = DepthProForDepthEstimation.from_pretrained("geetu040/depth-pro-hf", use_fov_model=False) ``` To instantiate a new model with FOV encoder, set `use_fov_model=True` in the config. @@ -129,7 +129,7 @@ SDPA is used by default for `torch>=2.1.1` when an implementation is available, ```py from transformers import DepthProForDepthEstimation -model = DepthProForDepthEstimation.from_pretrained("geetu040/DepthPro", attn_implementation="sdpa", torch_dtype=torch.float16) +model = DepthProForDepthEstimation.from_pretrained("geetu040/depth-pro-hf", attn_implementation="sdpa", torch_dtype=torch.float16) ``` For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). 
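
The documentation hunk above stops at the forward pass. For completeness, the sketch below (illustrative only, not part of the patch) shows how the predicted depth can be brought back to the original image resolution; it assumes the checkpoint name in use at this stage of the series, `geetu040/depth-pro-hf`, and the usual `(batch, height, width)` layout of `outputs.predicted_depth`. The `post_process_depth_estimation` helper exercised by the model tests wraps essentially the same resizing step.

```py
import requests
import torch
from PIL import Image

from transformers import DepthProForDepthEstimation, DepthProImageProcessorFast

# Checkpoint name as of this patch; it is renamed in later commits of this series.
checkpoint = "geetu040/depth-pro-hf"
image_processor = DepthProImageProcessorFast.from_pretrained(checkpoint)
model = DepthProForDepthEstimation.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# predicted_depth is assumed to be (batch, height, width); add a channel axis,
# resize back to the input resolution, then drop the channel axis again.
depth = torch.nn.functional.interpolate(
    outputs.predicted_depth.unsqueeze(1),
    size=(image.height, image.width),
    mode="bilinear",
    align_corners=False,
).squeeze(1)
print(depth.shape)
```
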
diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index f2cfc0bdd758..9a41a6aa027a 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -229,7 +229,7 @@ def main(): ) parser.add_argument( "--hub_repo_id", - default="geetu040/DepthPro", + default="geetu040/depth-pro-hf", help="Huggingface hub repo to write the converted model and processor", ) args = parser.parse_args() diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index e8421ab3bcea..319b782a5123 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -752,7 +752,7 @@ def forward( >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> checkpoint = "geetu040/DepthPro" + >>> checkpoint = "geetu040/depth-pro-hf" >>> processor = AutoProcessor.from_pretrained(checkpoint) >>> model = DepthProModel.from_pretrained(checkpoint) @@ -1142,7 +1142,7 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> checkpoint = "geetu040/DepthPro" + >>> checkpoint = "geetu040/depth-pro-hf" >>> processor = AutoImageProcessor.from_pretrained(checkpoint) >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 1e4ceadbd4eb..fc0d033c0d43 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -329,7 +329,7 @@ def test_batching_equivalence(self, atol=1e-4, rtol=1e-4): @slow def test_model_from_pretrained(self): - model_path = "geetu040/DepthPro" + model_path = "geetu040/depth-pro-hf" model = DepthProModel.from_pretrained(model_path) self.assertIsNotNone(model) @@ -345,7 +345,7 @@ def prepare_img(): @slow class DepthProModelIntegrationTest(unittest.TestCase): def test_inference_depth_estimation(self): - model_path = "geetu040/DepthPro" + model_path = "geetu040/depth-pro-hf" image_processor = DepthProImageProcessor.from_pretrained(model_path) model = DepthProForDepthEstimation.from_pretrained(model_path).to(torch_device) config = model.config @@ -378,7 +378,7 @@ def test_inference_depth_estimation(self): torch.testing.assert_close(outputs.field_of_view, expected_slice, atol=1e-4, rtol=1e-4) def test_post_processing_depth_estimation(self): - model_path = "geetu040/DepthPro" + model_path = "geetu040/depth-pro-hf" image_processor = DepthProImageProcessor.from_pretrained(model_path) model = DepthProForDepthEstimation.from_pretrained(model_path) From 592648c11687b2366d48f2fa721a283b6d052874 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Thu, 6 Feb 2025 08:24:09 +0500 Subject: [PATCH 149/151] update checkpoint --- docs/source/en/model_doc/depth_pro.md | 10 +++++----- .../depth_pro/convert_depth_pro_weights_to_hf.py | 2 +- .../models/depth_pro/modeling_depth_pro.py | 4 ++-- tests/models/depth_pro/test_modeling_depth_pro.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index e84f5a41a355..9ac15c6081d4 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ 
b/docs/source/en/model_doc/depth_pro.md @@ -48,8 +48,8 @@ The DepthPro model processes an input image by first downsampling it at multiple >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) ->>> image_processor = DepthProImageProcessorFast.from_pretrained("geetu040/depth-pro-hf") ->>> model = DepthProForDepthEstimation.from_pretrained("geetu040/depth-pro-hf").to(device) +>>> image_processor = DepthProImageProcessorFast.from_pretrained("apple/depth-pro-hf") +>>> model = DepthProForDepthEstimation.from_pretrained("apple/depth-pro-hf").to(device) >>> inputs = image_processor(images=image, return_tensors="pt").to(device) @@ -96,10 +96,10 @@ The network is supplemented with a focal length estimation head. A small convolu The `use_fov_model` parameter in `DepthProConfig` controls whether **FOV prediction** is enabled. By default, it is set to `False` to conserve memory and computation. When enabled, the **FOV encoder** is instantiated based on the `fov_model_config` parameter, which defaults to a `Dinov2Model`. The `use_fov_model` parameter can also be passed when initializing the `DepthProForDepthEstimation` model. -The pretrained model at checkpoint `geetu040/depth-pro-hf` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. +The pretrained model at checkpoint `apple/depth-pro-hf` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. ```py >>> from transformers import DepthProForDepthEstimation ->>> model = DepthProForDepthEstimation.from_pretrained("geetu040/depth-pro-hf", use_fov_model=False) +>>> model = DepthProForDepthEstimation.from_pretrained("apple/depth-pro-hf", use_fov_model=False) ``` To instantiate a new model with FOV encoder, set `use_fov_model=True` in the config. @@ -129,7 +129,7 @@ SDPA is used by default for `torch>=2.1.1` when an implementation is available, ```py from transformers import DepthProForDepthEstimation -model = DepthProForDepthEstimation.from_pretrained("geetu040/depth-pro-hf", attn_implementation="sdpa", torch_dtype=torch.float16) +model = DepthProForDepthEstimation.from_pretrained("apple/depth-pro-hf", attn_implementation="sdpa", torch_dtype=torch.float16) ``` For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). 
diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py
index 9a41a6aa027a..ec8732f80616 100644
--- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py
+++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py
@@ -229,7 +229,7 @@ def main():
     )
     parser.add_argument(
         "--hub_repo_id",
-        default="geetu040/depth-pro-hf",
+        default="apple/depth-pro-hf",
         help="Huggingface hub repo to write the converted model and processor",
     )
     args = parser.parse_args()

diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py
index 319b782a5123..3ba78dc2ad67 100644
--- a/src/transformers/models/depth_pro/modeling_depth_pro.py
+++ b/src/transformers/models/depth_pro/modeling_depth_pro.py
@@ -752,7 +752,7 @@ def forward(
         >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)

-        >>> checkpoint = "geetu040/depth-pro-hf"
+        >>> checkpoint = "apple/depth-pro-hf"
         >>> processor = AutoProcessor.from_pretrained(checkpoint)
         >>> model = DepthProModel.from_pretrained(checkpoint)

@@ -1142,7 +1142,7 @@ def forward(
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)

-        >>> checkpoint = "geetu040/depth-pro-hf"
+        >>> checkpoint = "apple/depth-pro-hf"
         >>> processor = AutoImageProcessor.from_pretrained(checkpoint)
         >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint)

diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py
index fc0d033c0d43..62b7465cac54 100644
--- a/tests/models/depth_pro/test_modeling_depth_pro.py
+++ b/tests/models/depth_pro/test_modeling_depth_pro.py
@@ -329,7 +329,7 @@ def test_batching_equivalence(self, atol=1e-4, rtol=1e-4):

     @slow
     def test_model_from_pretrained(self):
-        model_path = "geetu040/depth-pro-hf"
+        model_path = "apple/depth-pro-hf"
         model = DepthProModel.from_pretrained(model_path)
         self.assertIsNotNone(model)

@@ -345,7 +345,7 @@ def prepare_img():
 @slow
 class DepthProModelIntegrationTest(unittest.TestCase):
     def test_inference_depth_estimation(self):
-        model_path = "geetu040/depth-pro-hf"
+        model_path = "apple/depth-pro-hf"
         image_processor = DepthProImageProcessor.from_pretrained(model_path)
         model = DepthProForDepthEstimation.from_pretrained(model_path).to(torch_device)
         config = model.config
@@ -378,7 +378,7 @@ def test_inference_depth_estimation(self):
         torch.testing.assert_close(outputs.field_of_view, expected_slice, atol=1e-4, rtol=1e-4)

     def test_post_processing_depth_estimation(self):
-        model_path = "geetu040/depth-pro-hf"
+        model_path = "apple/depth-pro-hf"
         image_processor = DepthProImageProcessor.from_pretrained(model_path)
         model = DepthProForDepthEstimation.from_pretrained(model_path)

From 162f14166f2ebd36772967da51068cad846bc8e3 Mon Sep 17 00:00:00 2001
From: geetu040
Date: Thu, 6 Feb 2025 16:33:02 +0500
Subject: [PATCH 150/151] use @is_flaky for processor test
---
 tests/models/depth_pro/test_image_processing_depth_pro.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/models/depth_pro/test_image_processing_depth_pro.py b/tests/models/depth_pro/test_image_processing_depth_pro.py
index 5827512478d1..b30931a86cdb 100644
--- a/tests/models/depth_pro/test_image_processing_depth_pro.py
+++ b/tests/models/depth_pro/test_image_processing_depth_pro.py
@@ -16,7 +16,7
@@ import unittest -from transformers.testing_utils import require_torch, require_vision +from transformers.testing_utils import is_flaky, require_torch, require_vision from transformers.utils import is_torchvision_available, is_vision_available from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs @@ -116,3 +116,9 @@ def test_image_processor_from_dict_with_kwargs(self): image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size=42) self.assertEqual(image_processor.size, {"height": 42, "width": 42}) + + @is_flaky( + description="fast and slow, both processors use torch implementation, see: https://github.com/huggingface/transformers/issues/34920", + ) + def test_fast_is_faster_than_slow(self): + super().test_fast_is_faster_than_slow() From 4b762390086fde562f93ab04bf64b9f251e4c9de Mon Sep 17 00:00:00 2001 From: geetu040 Date: Fri, 7 Feb 2025 23:15:59 +0500 Subject: [PATCH 151/151] update checkpoint to "apple/DepthPro-hf" --- docs/source/en/model_doc/depth_pro.md | 10 +++++----- .../depth_pro/convert_depth_pro_weights_to_hf.py | 2 +- .../models/depth_pro/modeling_depth_pro.py | 4 ++-- tests/models/depth_pro/test_modeling_depth_pro.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/source/en/model_doc/depth_pro.md b/docs/source/en/model_doc/depth_pro.md index 9ac15c6081d4..2447b7d93dd5 100644 --- a/docs/source/en/model_doc/depth_pro.md +++ b/docs/source/en/model_doc/depth_pro.md @@ -48,8 +48,8 @@ The DepthPro model processes an input image by first downsampling it at multiple >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg' >>> image = Image.open(requests.get(url, stream=True).raw) ->>> image_processor = DepthProImageProcessorFast.from_pretrained("apple/depth-pro-hf") ->>> model = DepthProForDepthEstimation.from_pretrained("apple/depth-pro-hf").to(device) +>>> image_processor = DepthProImageProcessorFast.from_pretrained("apple/DepthPro-hf") +>>> model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf").to(device) >>> inputs = image_processor(images=image, return_tensors="pt").to(device) @@ -96,10 +96,10 @@ The network is supplemented with a focal length estimation head. A small convolu The `use_fov_model` parameter in `DepthProConfig` controls whether **FOV prediction** is enabled. By default, it is set to `False` to conserve memory and computation. When enabled, the **FOV encoder** is instantiated based on the `fov_model_config` parameter, which defaults to a `Dinov2Model`. The `use_fov_model` parameter can also be passed when initializing the `DepthProForDepthEstimation` model. -The pretrained model at checkpoint `apple/depth-pro-hf` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. +The pretrained model at checkpoint `apple/DepthPro-hf` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation. ```py >>> from transformers import DepthProForDepthEstimation ->>> model = DepthProForDepthEstimation.from_pretrained("apple/depth-pro-hf", use_fov_model=False) +>>> model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf", use_fov_model=False) ``` To instantiate a new model with FOV encoder, set `use_fov_model=True` in the config. 
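
A minimal sketch of that config-level toggle (illustrative only, not taken from the patch): building a randomly initialised model with the FOV head enabled only requires flipping the config flag.

```py
from transformers import DepthProConfig, DepthProForDepthEstimation

# Enable the field-of-view head at the config level; all other values keep their defaults.
config = DepthProConfig(use_fov_model=True)
model = DepthProForDepthEstimation(config)
print(model.config.use_fov_model)  # True
```
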
@@ -129,7 +129,7 @@ SDPA is used by default for `torch>=2.1.1` when an implementation is available, ```py from transformers import DepthProForDepthEstimation -model = DepthProForDepthEstimation.from_pretrained("apple/depth-pro-hf", attn_implementation="sdpa", torch_dtype=torch.float16) +model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf", attn_implementation="sdpa", torch_dtype=torch.float16) ``` For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). diff --git a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py index ec8732f80616..b24c6a5174f0 100644 --- a/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py +++ b/src/transformers/models/depth_pro/convert_depth_pro_weights_to_hf.py @@ -229,7 +229,7 @@ def main(): ) parser.add_argument( "--hub_repo_id", - default="apple/depth-pro-hf", + default="apple/DepthPro-hf", help="Huggingface hub repo to write the converted model and processor", ) args = parser.parse_args() diff --git a/src/transformers/models/depth_pro/modeling_depth_pro.py b/src/transformers/models/depth_pro/modeling_depth_pro.py index 3ba78dc2ad67..67715723d133 100644 --- a/src/transformers/models/depth_pro/modeling_depth_pro.py +++ b/src/transformers/models/depth_pro/modeling_depth_pro.py @@ -752,7 +752,7 @@ def forward( >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> checkpoint = "apple/depth-pro-hf" + >>> checkpoint = "apple/DepthPro-hf" >>> processor = AutoProcessor.from_pretrained(checkpoint) >>> model = DepthProModel.from_pretrained(checkpoint) @@ -1142,7 +1142,7 @@ def forward( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> checkpoint = "apple/depth-pro-hf" + >>> checkpoint = "apple/DepthPro-hf" >>> processor = AutoImageProcessor.from_pretrained(checkpoint) >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint) diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 62b7465cac54..44529270fd94 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -329,7 +329,7 @@ def test_batching_equivalence(self, atol=1e-4, rtol=1e-4): @slow def test_model_from_pretrained(self): - model_path = "apple/depth-pro-hf" + model_path = "apple/DepthPro-hf" model = DepthProModel.from_pretrained(model_path) self.assertIsNotNone(model) @@ -345,7 +345,7 @@ def prepare_img(): @slow class DepthProModelIntegrationTest(unittest.TestCase): def test_inference_depth_estimation(self): - model_path = "apple/depth-pro-hf" + model_path = "apple/DepthPro-hf" image_processor = DepthProImageProcessor.from_pretrained(model_path) model = DepthProForDepthEstimation.from_pretrained(model_path).to(torch_device) config = model.config @@ -378,7 +378,7 @@ def test_inference_depth_estimation(self): torch.testing.assert_close(outputs.field_of_view, expected_slice, atol=1e-4, rtol=1e-4) def test_post_processing_depth_estimation(self): - model_path = "apple/depth-pro-hf" + model_path = "apple/DepthPro-hf" image_processor = DepthProImageProcessor.from_pretrained(model_path) model = DepthProForDepthEstimation.from_pretrained(model_path)
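
Taken together, the image-processor commits earlier in this series make `DepthProImageProcessorFast` mirror the slow processor: both rescale and normalize before resizing, and both resize with torch, which is also why `test_fast_is_faster_than_slow` ends up marked flaky rather than skipped. The sketch below (illustrative, not part of any patch) compares the `pixel_values` produced by the two processors, assuming the final checkpoint name `apple/DepthPro-hf`:

```py
import requests
import torch
from PIL import Image

from transformers import DepthProImageProcessor, DepthProImageProcessorFast

checkpoint = "apple/DepthPro-hf"
slow_processor = DepthProImageProcessor.from_pretrained(checkpoint)
fast_processor = DepthProImageProcessorFast.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Both processors rescale/normalize first and then resize with torch, so their
# outputs should agree closely and their runtimes are dominated by the same kernels.
slow_pixels = slow_processor(images=image, return_tensors="pt")["pixel_values"]
fast_pixels = fast_processor(images=image, return_tensors="pt")["pixel_values"]

max_diff = (fast_pixels - slow_pixels).abs().max().item()
print(f"max abs difference between fast and slow pixel_values: {max_diff:.2e}")
```
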