@@ -5,11 +5,11 @@
 import torch
 from torch import nn
 import torch.nn.functional as F
-from transformers import PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.cache_utils import Cache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 
+from surya.common.pretrained import SuryaPreTrainedModel
 from surya.common.s3 import S3DownloaderMixin
 from surya.common.surya.config import SuryaModelConfig
 from surya.common.surya.decoder import SuryaDecoderModel
@@ -56,6 +56,7 @@ class FlashAttentionKwargs(TypedDict, total=False):
 
 class KwargsForCausalLM(FlashAttentionKwargs): ...
 
+
 class DistanceProjection(nn.Module):
     def __init__(self, in_features: int, out_features: int):
         super().__init__()
@@ -75,7 +76,8 @@ def init_weights(self):
         nn.init.zeros_(self.fc1.bias)
         nn.init.zeros_(self.fc2.bias)
 
-class SuryaModel(S3DownloaderMixin, PreTrainedModel):
+
+class SuryaModel(S3DownloaderMixin, SuryaPreTrainedModel):
     config_class = SuryaModelConfig
     supports_gradient_checkpointing = True
     _skip_keys_device_placement = ["past_key_values"]
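The hunks above show only `DistanceProjection`'s constructor signature and its zero-initialised biases. A minimal two-layer projection consistent with those lines, written as a standalone sketch (the activation and the layer widths are assumptions, not taken from the diff):

```python
import torch
from torch import nn


class TwoLayerProjection(nn.Module):
    """Hypothetical stand-in for DistanceProjection: fc1 -> activation -> fc2."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.fc1 = nn.Linear(in_features, out_features)
        self.fc2 = nn.Linear(out_features, out_features)
        self.act = nn.GELU()  # assumption: the real activation is not visible in the diff

    def init_weights(self):
        # Mirrors the visible init_weights lines: both biases start at zero.
        nn.init.zeros_(self.fc1.bias)
        nn.init.zeros_(self.fc2.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(self.act(self.fc1(x)))


proj = TwoLayerProjection(in_features=1024, out_features=1024)
proj.init_weights()
out = proj(torch.randn(2, 1, 1024))  # shape preserved: (2, 1, 1024)
```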
@@ -95,8 +97,9 @@ def __init__(
         embedder: SimpleTokenEmbedder = None,
         vision_encoder: SuryaEncoderModel = None,
         decoder: SuryaDecoderModel = None,
+        **kwargs,
     ):
-        super().__init__(config)
+        super().__init__(config, **kwargs)
 
         if vision_encoder is None:
             vision_encoder = SuryaEncoderModel(config.vision_encoder)
@@ -166,29 +169,30 @@ def maybe_static_pad_image_inputs(
         chunk_pixels: torch.Tensor,
         chunk_grid_thw: torch.Tensor,
         actual_chunk_len: int,
-        encoder_chunk_size: int
+        encoder_chunk_size: int,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        valid_embed_len = actual_chunk_len // (self.vision_encoder.spatial_merge_size ** 2)
+        valid_embed_len = actual_chunk_len // (
+            self.vision_encoder.spatial_merge_size**2
+        )
         if settings.FOUNDATION_STATIC_CACHE and actual_chunk_len < encoder_chunk_size:
             padding_len = encoder_chunk_size - actual_chunk_len
             padding = torch.zeros(
-                padding_len,
+                padding_len,
                 *chunk_pixels.shape[1:],
                 device=chunk_pixels.device,
-                dtype=chunk_pixels.dtype
+                dtype=chunk_pixels.dtype,
             )
             chunk_pixels = torch.cat([chunk_pixels, padding], dim=0)
-
+
             padding_grid = torch.tensor(
                 [[1, 2, padding_len // 2]],
                 device=chunk_grid_thw.device,
-                dtype=chunk_grid_thw.dtype
+                dtype=chunk_grid_thw.dtype,
             )
             chunk_grid_thw = torch.cat([chunk_grid_thw, padding_grid], dim=0)
 
         return chunk_pixels, chunk_grid_thw, valid_embed_len
 
-
     def get_image_embeddings(
         self,
         pixel_values: torch.Tensor,
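`maybe_static_pad_image_inputs` keeps the vision encoder's input length fixed at `encoder_chunk_size` when `FOUNDATION_STATIC_CACHE` is set, while remembering how many merged embeddings are real. A self-contained sketch of that idea with hypothetical names (`spatial_merge_size=2` assumed, the grid padding omitted):

```python
import torch


def pad_chunk_to_static_size(
    chunk_pixels: torch.Tensor,  # (actual_chunk_len, patch_dim)
    actual_chunk_len: int,
    encoder_chunk_size: int,
    spatial_merge_size: int = 2,  # assumed value; comes from the vision encoder config
):
    # Each merged embedding consumes spatial_merge_size**2 patch rows,
    # so only this many output embeddings correspond to real pixels.
    valid_embed_len = actual_chunk_len // (spatial_merge_size**2)
    if actual_chunk_len < encoder_chunk_size:
        padding = torch.zeros(
            encoder_chunk_size - actual_chunk_len,
            *chunk_pixels.shape[1:],
            device=chunk_pixels.device,
            dtype=chunk_pixels.dtype,
        )
        chunk_pixels = torch.cat([chunk_pixels, padding], dim=0)
    return chunk_pixels, valid_embed_len


# A 100-row chunk padded up to a 128-row static chunk keeps 25 valid embeddings.
pixels = torch.randn(100, 1176)
padded, valid = pad_chunk_to_static_size(pixels, 100, 128)
assert padded.shape[0] == 128 and valid == 25
```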
@@ -225,15 +229,18 @@ def get_image_embeddings(
             end = chunks[i + 1]
             grid_start = grid_chunks[i]
             grid_end = grid_chunks[i + 1]
-
+
             chunk_pixels = pixel_values[start:end]
             chunk_grid_thw = grid_thw[grid_start:grid_end]
             actual_chunk_len = end - start
-            chunk_pixels, chunk_grid_thw, valid_embed_len = self.maybe_static_pad_image_inputs(chunk_pixels, chunk_grid_thw, actual_chunk_len, encoder_chunk_size)
+            chunk_pixels, chunk_grid_thw, valid_embed_len = (
+                self.maybe_static_pad_image_inputs(
+                    chunk_pixels, chunk_grid_thw, actual_chunk_len, encoder_chunk_size
+                )
+            )
 
             chunk_embeddings = self.vision_encoder.embed_images(
-                image_batch=chunk_pixels,
-                grid_thw=chunk_grid_thw
+                image_batch=chunk_pixels, grid_thw=chunk_grid_thw
             )
             embeddings.append(chunk_embeddings[:valid_embed_len])
 
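The surrounding loop in `get_image_embeddings` encodes the pixel rows chunk by chunk and drops any embeddings that only exist because of static padding before concatenating. Reduced to its essentials (hypothetical helper, not the surya API):

```python
from typing import Callable, List

import torch


def embed_in_chunks(
    pixel_values: torch.Tensor,
    chunk_bounds: List[int],  # e.g. [0, 128, 256, 300]
    valid_lens: List[int],    # valid embeddings per chunk after static padding
    encode_fn: Callable[[torch.Tensor], torch.Tensor],
) -> torch.Tensor:
    embeddings = []
    for i in range(len(chunk_bounds) - 1):
        chunk = pixel_values[chunk_bounds[i] : chunk_bounds[i + 1]]
        chunk_embeddings = encode_fn(chunk)
        # Keep only the embeddings that correspond to real pixels.
        embeddings.append(chunk_embeddings[: valid_lens[i]])
    return torch.cat(embeddings, dim=0)


# Toy usage: an "encoder" that merges every 4 input rows into one embedding.
encode = lambda x: x.reshape(-1, 4 * x.shape[-1])
out = embed_in_chunks(torch.randn(256, 8), [0, 128, 256], [32, 32], encode)
assert out.shape == (64, 32)
```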
@@ -340,28 +347,30 @@ def get_2d_learned_embeddings(
         )  # Shape is num_image_tokens x embed_dim
 
     def get_logits(self, hidden_states):
-        assert hidden_states.shape[1] == 1, "Multi output predictions only applied on the last token"
+        assert hidden_states.shape[1] == 1, (
+            "Multi output predictions only applied on the last token"
+        )
 
         all_lm_logits = []
         all_bbox_logits = []
-
+
         current_hidden = hidden_states
-
+
         # Loop includes initial prediction (i=0) plus multi_out_distance additional predictions
         for i in range(self.config.multi_output_distance + 1):
             if i > 0:
-                current_hidden = self.multi_output_projections[i - 1](current_hidden)
-
+                current_hidden = self.multi_output_projections[i - 1](current_hidden)
+
             lm_logits = self.lm_head(current_hidden)
             bbox_logits = F.sigmoid(self.bbox_head(current_hidden))
-
+
             all_lm_logits.append(lm_logits)
             all_bbox_logits.append(bbox_logits)
-
+
         # Concatenate along sequence dimension (dim=1)
         final_lm_logits = torch.cat(all_lm_logits, dim=1)
         final_bbox_logits = torch.cat(all_bbox_logits, dim=1)
-
+
         return final_lm_logits, final_bbox_logits
 
     def forward(
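`get_logits` turns the single last hidden state into `multi_output_distance + 1` predictions by repeatedly applying the learned projections and decoding at each step. A stripped-down, runnable sketch of the same pattern (hypothetical module, bbox head omitted):

```python
import torch
from torch import nn


class MultiOutputHead(nn.Module):
    """Sketch of multi-token prediction: project the last hidden state forward
    `extra_steps` times and decode logits at every step."""

    def __init__(self, hidden: int, vocab: int, extra_steps: int):
        super().__init__()
        self.lm_head = nn.Linear(hidden, vocab, bias=False)
        self.projections = nn.ModuleList(
            nn.Linear(hidden, hidden) for _ in range(extra_steps)
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        assert hidden_states.shape[1] == 1, "expects only the last token position"
        all_logits = []
        current = hidden_states
        for i in range(len(self.projections) + 1):
            if i > 0:
                current = self.projections[i - 1](current)
            all_logits.append(self.lm_head(current))
        # (batch, extra_steps + 1, vocab): one prediction per future position
        return torch.cat(all_logits, dim=1)


head = MultiOutputHead(hidden=64, vocab=100, extra_steps=3)
logits = head(torch.randn(2, 1, 64))
assert logits.shape == (2, 4, 100)
```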
@@ -387,24 +396,25 @@ def forward(
         **kwargs: KwargsForCausalLM,
     ):
         # Process the mixed batch if provided
-        if any([
-            input_ids is None,
-            (prefill and (image_tiles is None or grid_thw is None)),
-            position_ids is None,
-            cache_position is None
-        ]):
-            raise ValueError("`input_ids`, `position_ids`, and `cache_position` **must** be specified. `image_tiles` and `grid_thw` are required for prefill")
+        if any(
+            [
+                input_ids is None,
+                (prefill and (image_tiles is None or grid_thw is None)),
+                position_ids is None,
+                cache_position is None,
+            ]
+        ):
+            raise ValueError(
+                "`input_ids`, `position_ids`, and `cache_position` **must** be specified. `image_tiles` and `grid_thw` are required for prefill"
+            )
 
         inputs_embeds = self.embed_ids_boxes_images(
             input_ids, image_tiles, grid_thw, encoder_chunk_size
         )
 
         # Handling flash attention kwargs outside the decoder to speed up + avoid graph breaks inside the decoder
         # Skipped during decoding since not required
-        if (
-            self.decoder.config._attn_implementation == "flash_attention_2"
-            and prefill
-        ):
+        if self.decoder.config._attn_implementation == "flash_attention_2" and prefill:
             batch_size, query_length, _ = inputs_embeds.shape
             indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(
                 attention_mask
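The prefill path precomputes flash-attention unpadding metadata outside the decoder. `_get_unpad_data` itself is not shown in this diff; transformers-style implementations are typically along these lines (a sketch, not the exact helper used here):

```python
import torch
import torch.nn.functional as F


def get_unpad_data(attention_mask: torch.Tensor):
    """Sketch of the usual flash-attention unpadding metadata.

    attention_mask: (batch, seq_len) with 1 for real tokens, 0 for padding.
    Returns the flat indices of real tokens, cumulative sequence lengths,
    and the longest sequence in the batch.
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = int(seqlens_in_batch.max())
    cu_seqlens = F.pad(
        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
    )
    return indices, cu_seqlens, max_seqlen_in_batch


mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
indices, cu_seqlens, max_len = get_unpad_data(mask)
# indices -> [0, 1, 2, 4, 5]; cu_seqlens -> [0, 3, 5]; max_len -> 3
```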
@@ -451,7 +461,9 @@ def forward(
             bbox_logits = None
             vocab_size = lm_logits.shape[-1]
             labels = torch.roll(labels, shifts=-1, dims=-1)
-            loss = F.cross_entropy(lm_logits.view(-1, vocab_size), labels.view(-1), reduction="mean")
+            loss = F.cross_entropy(
+                lm_logits.view(-1, vocab_size), labels.view(-1), reduction="mean"
+            )
         else:
             lm_logits, bbox_logits = self.get_logits(hidden_states)
 
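The training branch computes a next-token loss by rolling the labels one step to the left, so the logits at position t are scored against the token at position t + 1. In isolation:

```python
import torch
import torch.nn.functional as F

batch, seq_len, vocab_size = 2, 8, 32
lm_logits = torch.randn(batch, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (batch, seq_len))

# Shift targets left: the logits at position t are scored against token t + 1.
shifted = torch.roll(labels, shifts=-1, dims=-1)
loss = F.cross_entropy(
    lm_logits.view(-1, vocab_size), shifted.view(-1), reduction="mean"
)
# In practice the wrapped-around final position would carry an ignore label
# (F.cross_entropy skips targets equal to ignore_index, -100 by default).
```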
@@ -561,9 +573,15 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
             device=device,
         )
         # Batch-aware diagonal attend mask
-        diagonal_attend_mask = torch.arange(target_length, device=device).unsqueeze(0) > cache_position.unsqueeze(-1)
-        causal_mask = causal_mask.unsqueeze(0) * diagonal_attend_mask  # (batch_size, seq_len, target_len)
-        causal_mask = causal_mask[:, None, :, :]  # (batch_size, 1, seq_len, target_len)
+        diagonal_attend_mask = torch.arange(target_length, device=device).unsqueeze(
+            0
+        ) > cache_position.unsqueeze(-1)
+        causal_mask = (
+            causal_mask.unsqueeze(0) * diagonal_attend_mask
+        )  # (batch_size, seq_len, target_len)
+        causal_mask = causal_mask[
+            :, None, :, :
+        ]  # (batch_size, 1, seq_len, target_len)
         if attention_mask is not None:
             causal_mask = (
                 causal_mask.clone()
@@ -578,4 +596,4 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
             causal_mask[:, :, :, :mask_length] = causal_mask[
                 :, :, :, :mask_length
             ].masked_fill(padding_mask, min_dtype)
-        return causal_mask
+        return causal_mask
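`_prepare_4d_causal_attention_mask_with_cache_position` builds an additive `(batch, 1, query_len, target_len)` mask: everything starts blocked at the dtype minimum, the batch-aware diagonal re-opens key positions at or before each query's cache position, and the padding mask closes padded keys again. A compact sketch of that construction (simplified signature, assuming a batched `cache_position`):

```python
import torch


def build_4d_causal_mask(attention_mask, cache_position, target_length, dtype=torch.float32):
    """Sketch of a batch-aware additive causal mask.

    attention_mask: (batch, target_length), 1 for real tokens, 0 for padding.
    cache_position: (batch, query_length), absolute position of each query token.
    Returns (batch, 1, query_length, target_length): 0 where attention is
    allowed, the dtype minimum where it is blocked.
    """
    min_dtype = torch.finfo(dtype).min
    query_length = cache_position.shape[-1]

    causal_mask = torch.full((query_length, target_length), min_dtype, dtype=dtype)
    # Block key positions strictly after each query's own cache position.
    diagonal_attend_mask = (
        torch.arange(target_length).unsqueeze(0) > cache_position.unsqueeze(-1)
    )  # (batch, query_length, target_length)
    causal_mask = causal_mask.unsqueeze(0) * diagonal_attend_mask
    causal_mask = causal_mask[:, None, :, :]  # add the head dimension

    # Additionally block padded key positions.
    padding_mask = (causal_mask + attention_mask[:, None, None, :]) == 0
    return causal_mask.masked_fill(padding_mask, min_dtype)


mask = build_4d_causal_mask(
    attention_mask=torch.tensor([[1, 1, 1, 0]]),
    cache_position=torch.tensor([[0, 1, 2, 3]]),
    target_length=4,
)
assert mask.shape == (1, 1, 4, 4)
```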