amazon-science · abdulfatir · Oct 22, 2025 · Oct 21, 2025 · Oct 21, 2025 · Oct 21, 2025
diff --git a/scripts/training/train.py b/scripts/training/train.py
@@ -663,7 +663,6 @@ def main(
         lr_scheduler_type=lr_scheduler_type,
         warmup_ratio=warmup_ratio,
         optim=optim,
-        logging_dir=str(output_dir / "logs"),
         logging_strategy="steps",
         logging_steps=log_steps,
         save_strategy="steps",

diff --git a/src/chronos/chronos2/config.py b/src/chronos/chronos2/config.py
@@ -39,6 +39,8 @@ class Chronos2CoreConfig(PretrainedConfig):
         Token ID for padding/missing value token, by default 0
     rope_theta
         The base theta for rotary position embedding (RoPE), by default 10000.0
+    attn_implementation
+        The attention implementation to use. Options: "eager", "sdpa", "flash_attention_2", by default None (uses "sdpa")
     """
 
     model_type = "t5"
@@ -63,6 +65,7 @@ def __init__(
         vocab_size: int = 2,
         pad_token_id: int = 0,
         rope_theta: float = 10000.0,
+        attn_implementation: str | None = None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -83,6 +86,9 @@ def __init__(
 
         assert not self.is_gated_act, "gated activation is not supported"
 
+        # Attention implementation - default to "sdpa" if not specified
+        self._attn_implementation = attn_implementation or "sdpa"
+
         # unused
         kwargs.pop("is_encoder_decoder", None)
         kwargs.pop("eos_token_id", None)

diff --git a/src/chronos/chronos2/layers.py b/src/chronos/chronos2/layers.py
@@ -155,6 +155,7 @@ def __init__(self, config: Chronos2CoreConfig, use_rope: bool = True):
         self.n_heads: int = config.num_heads
         self.dropout: float = config.dropout_rate
         self.inner_dim: int = self.n_heads * self.kv_proj_dim
+        self.config = config
 
         self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
         self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
@@ -165,6 +166,123 @@ def __init__(self, config: Chronos2CoreConfig, use_rope: bool = True):
         if use_rope:
             self.rope_embed = RoPE(dim=self.kv_proj_dim, base=config.rope_theta)
 
+    def _eager_attention(
+        self,
+        query_states: torch.Tensor,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        mask: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Compute attention with explicit matmuls and return the attention weights.
+
+        The dot products are deliberately left unscaled, as in the original Chronos-2 implementation.
+
+        Args:
+            query_states: [batch, n_heads, seq_len, kv_proj_dim]
+            key_states: [batch, n_heads, seq_len, kv_proj_dim]
+            value_states: [batch, n_heads, seq_len, kv_proj_dim]
+            mask: [batch, n_heads, q_len, kv_len], added to the raw scores before the softmax
+
+        Returns:
+            attn_output: [batch, n_heads, seq_len, kv_proj_dim]
+            attn_weights: [batch, n_heads, q_len, kv_len], after dropout
+        """
+        logits = query_states @ key_states.transpose(2, 3)  # "bnqd,bnkd->bnqk"
+        logits.add_(mask)  # in-place add of the additive mask
+        # Softmax runs in float32 and is cast back to the type of the logits
+        probs = nn.functional.softmax(logits.float(), dim=-1).type_as(logits)
+        probs = nn.functional.dropout(probs, p=self.dropout, training=self.training)
+        return torch.matmul(probs, value_states), probs
+
+    def _sdpa_attention(
+        self,
+        query_states: torch.Tensor,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        mask: torch.Tensor,
+    ) -> tuple[torch.Tensor, None]:
+        """Compute attention through torch.nn.functional.scaled_dot_product_attention.
+
+        Args:
+            query_states: [batch, n_heads, seq_len, kv_proj_dim]
+            key_states: [batch, n_heads, seq_len, kv_proj_dim]
+            value_states: [batch, n_heads, seq_len, kv_proj_dim]
+            mask: [batch, n_heads, q_len, kv_len] - additive mask (0 for valid, -inf for invalid)
+
+        Returns:
+            attn_output: [batch, n_heads, seq_len, kv_proj_dim]
+            attn_weights: always None, since SDPA does not hand back the attention weights
+        """
+        # Dropout is only applied while the module is in training mode
+        drop_prob = self.dropout if self.training else 0.0
+
+        # scale=1.0 keeps the scores unscaled so the result matches the eager implementation
+        attn_output = nn.functional.scaled_dot_product_attention(
+            query_states, key_states, value_states,
+            attn_mask=mask, dropout_p=drop_prob, scale=1.0,
+        )
+
+        return attn_output, None
+
+    def _flash_attention_2(
+        self,
+        query_states: torch.Tensor,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        mask: torch.Tensor,
+    ) -> tuple[torch.Tensor, None]:
+        """FlashAttention-2 implementation.
+
+        flash_attn_func takes no additive mask; a mask with any non-zero entry is handled by the SDPA path instead.
+
+        Args:
+            query_states: [batch, n_heads, seq_len, kv_proj_dim]
+            key_states: [batch, n_heads, seq_len, kv_proj_dim]
+            value_states: [batch, n_heads, seq_len, kv_proj_dim]
+            mask: [batch, n_heads, q_len, kv_len] - additive mask (0 for valid positions)
+
+        Returns:
+            attn_output: [batch, n_heads, seq_len, kv_proj_dim]
+            attn_weights: None (FlashAttention doesn't return weights)
+
+        Raises:
+            ImportError: if the flash-attn package is not installed
+        """
+        try:
+            from flash_attn import flash_attn_func
+        except ImportError as err:
+            raise ImportError(
+                "FlashAttention-2 is not installed. Please install it with: "
+                "pip install flash-attn --no-build-isolation"
+            ) from err
+
+        if mask is not None and bool(torch.any(mask != 0)):  # NOTE: this check forces a host-device sync
+            return self._sdpa_attention(query_states, key_states, value_states, mask)
+
+        # FlashAttention only supports fp16 and bf16
+        input_dtype = query_states.dtype
+        needs_cast = input_dtype not in (torch.float16, torch.bfloat16)
+        if needs_cast:
+            target_dtype = torch.float16 if torch.cuda.is_available() else torch.bfloat16
+            query_states, key_states, value_states = (
+                states.to(target_dtype) for states in (query_states, key_states, value_states)
+            )
+
+        # FlashAttention expects [batch, seq_len, n_heads, head_dim]; we have [batch, n_heads, seq_len, head_dim]
+        attn_output = flash_attn_func(
+            query_states.transpose(1, 2),
+            key_states.transpose(1, 2),
+            value_states.transpose(1, 2),
+            dropout_p=self.dropout if self.training else 0.0,
+            softmax_scale=1.0,  # Match eager implementation (no scaling)
+            causal=False,  # Chronos uses bidirectional attention by default
+        )
+
+        # Restore the caller's dtype (if it was changed) and the [batch, n_heads, seq_len, head_dim] layout
+        if needs_cast:
+            attn_output = attn_output.to(input_dtype)
+        return attn_output.transpose(1, 2), None
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -190,6 +308,11 @@ def forward(
         if self.use_rope:
             assert position_ids is not None, "position_ids must be provided when self.use_rope=True"
 
+        # Force eager attention if output_attentions is True (only eager returns weights)
+        attn_implementation = self.config._attn_implementation
+        if output_attentions and attn_implementation != "eager":
+            attn_implementation = "eager"
+
         seq_length = hidden_states.shape[1]
 
         def shape(states: torch.Tensor) -> torch.Tensor:
@@ -215,12 +338,13 @@ def unshape(states: torch.Tensor) -> torch.Tensor:
                 cos, sin = self.rope_embed(value_states, position_ids)
                 query_states, key_states = RoPE.apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
-        # Compute attention weights
-        scores = torch.matmul(query_states, key_states.transpose(3, 2))  # "bnqd,bnkd->bnqk"
-        scores += mask
-        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
-        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
-        attn_output = torch.matmul(attn_weights, value_states)
+        # Dispatch to appropriate attention implementation
+        if attn_implementation == "sdpa":
+            attn_output, attn_weights = self._sdpa_attention(query_states, key_states, value_states, mask)
+        elif attn_implementation == "flash_attention_2":
+            attn_output, attn_weights = self._flash_attention_2(query_states, key_states, value_states, mask)
+        else:  # eager or default
+            attn_output, attn_weights = self._eager_attention(query_states, key_states, value_states, mask)
 
         # Project attention output
         attn_output = unshape(attn_output)

diff --git a/src/chronos/chronos2/model.py b/src/chronos/chronos2/model.py
@@ -199,6 +199,8 @@ class Chronos2Model(PreTrainedModel):
     config_class = Chronos2CoreConfig  # type: ignore[assignment]
     _supports_long_horizon: bool = True
     _supports_future_covariates: bool = True
+    _supports_sdpa: bool = True
+    _supports_flash_attn_2: bool = True
 
     def __init__(self, config: Chronos2CoreConfig):
         assert hasattr(config, "chronos_config"), "Not a valid Chronos config"

diff --git a/src/chronos/chronos2/pipeline.py b/src/chronos/chronos2/pipeline.py
@@ -211,7 +211,6 @@ def fit(
             lr_scheduler_type="linear",
             warmup_ratio=0.0,
             optim="adamw_torch_fused",
-            logging_dir=str(output_dir / "logs"),
             logging_strategy="steps",
             logging_steps=100,
             disable_tqdm=False,