
Commit c431947

danilyef, Nikita, superbock, and martinloretzzz authored
Speed up forecasting and making code more readable (#28)
* check tests with attention
* check on jupyterlab_docker branch
* chore: skip test_jupyterlab_running if Docker is not available
* chore: run all tests inside Docker container
* freezing dependencies up to next major version
* check tests with attention
* chore: run all tests inside Docker container
* clamping logsigmoid, duplication del and more readable
* fix white-spaces
* Added suggestions for PR and replaced pure Python by torch
* added self.num_heads instead of NH for ONNX compatibility. Otherwise, ONNX conversion throws an error
* Resolving conflicts with main
* Resolve the conflict p2
* jupyterlab_docker is removed
* correct github actions
* Add comment to clamp

---------

Co-authored-by: Nikita <[email protected]>
Co-authored-by: Sebastian Böck <[email protected]>
Co-authored-by: martinloretzzz <[email protected]>
1 parent ed79a50 commit c431947

4 files changed

Lines changed: 47 additions & 41 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -30,4 +30,5 @@ share/python-wheels/
 
 .vscode
 .venv
+.ruff_cache
 torch_compile_debug

src/tirex/models/slstm/cell.py

Lines changed: 3 additions & 3 deletions
@@ -100,7 +100,7 @@ def _impl_cuda(self, input: torch.Tensor, state: torch.Tensor) -> torch.Tensor:
 
     def _get_input(self, x: torch.Tensor) -> torch.Tensor:
         assert x.shape[-1] == self.config.embedding_dim * self.config.num_gates, (
-            f"Input size mismatch: Expected input size {self.config.embedding_dim * self.config.num_gates}, but got {input.size(-1)}."
+            f"Input size mismatch: Expected input size {self.config.embedding_dim * self.config.num_gates}, but got {x.size(-1)}."
         )
         return x.view(x.shape[0], x.shape[1], self.config.num_gates, self.config.num_heads, -1).permute(1, 0, 2, 3, 4)

@@ -128,7 +128,7 @@ def slstm_forward(
     states: torch.Tensor,  # [4, B, H] only the first is used for recurrence!
     R: torch.Tensor,  # [K, R*H, H] - K num_heads
     b: torch.Tensor,  # [T*H]
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     num_gates = 4
     num_heads = R.shape[0]
     S, B, _ = x.shape

@@ -167,7 +167,7 @@ def slstm_forward_pointwise(
     iraw, fraw, zraw, oraw = torch.unbind(raw.view(raw.shape[0], 4, -1), dim=1)
 
     # Equations reference the xlstm paper on page 4: https://arxiv.org/pdf/2405.04517
-    logfplusm = m + F.logsigmoid(fraw)  # eq 15
+    logfplusm = m + F.logsigmoid(torch.clamp(fraw, max=15))  # eq 15  # Clamp to avoid subnormals
     mnew = torch.where(torch.all(n == 0.0), iraw, torch.max(iraw, logfplusm))  # eq 15
     ogate = torch.sigmoid(oraw)  # eq 14
     igate = torch.minimum(torch.exp(iraw - mnew), torch.ones_like(iraw))  # eq 16
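
A note on the clamp introduced above: F.logsigmoid(x) behaves like -exp(-x) for large x, so a very confident forget gate produces vanishingly small negative values that can underflow into subnormal floats (which are slow on many CPUs), with the exact threshold depending on the compute dtype. A minimal sketch of the effect, not part of the diff; the tensor values are illustrative:

import torch
import torch.nn.functional as F

# Illustrative forget-gate pre-activations; the large values mimic a very confident gate.
fraw = torch.tensor([0.5, 15.0, 60.0, 120.0])

# Unclamped: the result decays toward zero and can end up in the subnormal range.
unclamped = F.logsigmoid(fraw)

# Clamped as in the commit: the result never drops below logsigmoid(15) ~ -3.1e-7,
# a normal float, while the gate changes by less than ~3e-7 in log space.
clamped = F.logsigmoid(torch.clamp(fraw, max=15))

print(unclamped)  # roughly tensor([-4.7e-01, -3.1e-07, -8.8e-27, -0.0e+00])
print(clamped)    # roughly tensor([-4.7e-01, -3.1e-07, -3.1e-07, -3.1e-07])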

src/tirex/models/slstm/layer.py

Lines changed: 5 additions & 3 deletions
@@ -20,7 +20,7 @@ def __init__(self, config: sLSTMBlockConfig, backend: str):
         self.ogate = LinearHeadwiseExpand(in_features, num_heads)
 
         self.slstm_cell = sLSTMCell(self.config, backend)
-        self.group_norm = MultiHeadLayerNorm(ndim=in_features)
+        self.group_norm = MultiHeadLayerNorm(ndim=in_features, num_heads=num_heads)
 
     def forward(self, x: torch.Tensor, slstm_state: torch.Tensor | None = None) -> torch.Tensor:
         x_g = torch.cat((self.fgate(x), self.igate(x), self.zgate(x), self.ogate(x)), dim=-1)

@@ -50,18 +50,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 class MultiHeadLayerNorm(nn.Module):
-    def __init__(self, ndim: int):
+    def __init__(self, ndim: int, num_heads: int):
         super().__init__()
         self.weight = nn.Parameter(torch.zeros(ndim))
+        self.num_heads = num_heads
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
         assert input.dim() == 4, "Input must be 4D tensor (B, NH, S, DH)"
         B, NH, S, DH = input.shape
 
+        assert NH == self.num_heads
         gn_in_1 = input.transpose(1, 2)  # (B, S, NH, DH)
         gn_in_2 = gn_in_1.reshape(B * S, NH * DH)  # (B * S, NH * DH)
         residual_weight = 1.0 + self.weight
-        out = F.group_norm(gn_in_2, num_groups=NH, weight=residual_weight)
+        out = F.group_norm(gn_in_2, num_groups=self.num_heads, weight=residual_weight)
         # (B * S), (NH * DH) -> (B, S, NH, DH) -> (B, NH, S, DH)
         out = out.view(B, S, NH, DH).transpose(1, 2)
         return out
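
Why the num_heads argument matters: storing the head count at construction time lets F.group_norm use a constant num_groups instead of a value read from the runtime shape, which is what the ONNX-compatibility bullet in the commit message refers to. A self-contained sketch of the updated module plus an illustrative usage, reconstructed from the hunk above (the sizes below are made up):

import torch
import torch.nn as nn
import torch.nn.functional as F


class MultiHeadLayerNorm(nn.Module):
    """Group norm over (NH * DH) features, one group per head, as in the hunk above."""

    def __init__(self, ndim: int, num_heads: int):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(ndim))
        self.num_heads = num_heads

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        assert input.dim() == 4, "Input must be 4D tensor (B, NH, S, DH)"
        B, NH, S, DH = input.shape
        assert NH == self.num_heads

        gn_in = input.transpose(1, 2).reshape(B * S, NH * DH)  # (B * S, NH * DH)
        residual_weight = 1.0 + self.weight
        # num_groups is the stored constant, not the runtime shape NH.
        out = F.group_norm(gn_in, num_groups=self.num_heads, weight=residual_weight)
        return out.view(B, S, NH, DH).transpose(1, 2)  # back to (B, NH, S, DH)


# Illustrative sizes: 4 heads of dimension 128, so ndim = 4 * 128.
norm = MultiHeadLayerNorm(ndim=512, num_heads=4)
x = torch.randn(2, 4, 16, 128)  # (B, NH, S, DH)
assert norm(x).shape == x.shape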

src/tirex/models/tirex.py

Lines changed: 38 additions & 35 deletions
@@ -79,12 +79,18 @@ def _forecast_quantiles(
         training_quantile_levels = self.config.quantiles
 
         if set(quantile_levels).issubset(set(training_quantile_levels)):
-            quantiles = predictions[..., [training_quantile_levels.index(q) for q in quantile_levels]]
+            quantile_indices = torch.tensor(
+                [training_quantile_levels.index(q) for q in quantile_levels],
+                dtype=torch.long,
+                device=predictions.device,
+            )
+            quantiles = torch.index_select(predictions, dim=-1, index=quantile_indices)
         else:
             quantiles = self._interpolate_quantiles(predictions, quantile_levels)
 
         # median as mean
-        mean = predictions[:, :, training_quantile_levels.index(0.5)]
+        median_idx = torch.tensor([training_quantile_levels.index(0.5)], dtype=torch.long, device=predictions.device)
+        mean = torch.index_select(predictions, dim=-1, index=median_idx).squeeze(-1)
         return quantiles, mean
 
     @torch.inference_mode()
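
Context for the change above: the removed line is pure-Python fancy indexing, which per the commit message was replaced by torch ops; building the index as a tensor and using torch.index_select keeps the selection as a graph op (e.g. for ONNX export). A minimal sketch of the same pattern with illustrative quantile levels, not the model's actual configuration:

import torch

# Illustrative values; the real levels come from self.config.quantiles.
training_quantile_levels = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
quantile_levels = [0.1, 0.5, 0.9]

# [batch, horizon, num_trained_quantiles]
predictions = torch.randn(2, 24, len(training_quantile_levels))

# Build the index once as a tensor on the right device...
quantile_indices = torch.tensor(
    [training_quantile_levels.index(q) for q in quantile_levels],
    dtype=torch.long,
    device=predictions.device,
)
# ...and select along the last dimension with a tensor op.
quantiles = torch.index_select(predictions, dim=-1, index=quantile_indices)  # [2, 24, 3]

# The median column doubles as the point forecast ("mean").
median_idx = torch.tensor([training_quantile_levels.index(0.5)], dtype=torch.long, device=predictions.device)
mean = torch.index_select(predictions, dim=-1, index=median_idx).squeeze(-1)  # [2, 24]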
@@ -105,24 +111,8 @@ def _forecast_tensor(
 
         context = context.to(dtype=torch.float32)
         while remaining > 0:
-            if context.shape[-1] > max_context:
-                context = context[..., -max_context:]
-            if context.shape[-1] < min_context:
-                pad = torch.full(
-                    (context.shape[0], min_context - context.shape[-1]),
-                    fill_value=torch.nan,
-                    device=context.device,
-                    dtype=context.dtype,
-                )
-                context = torch.concat((pad, context), dim=1)
-            tokenized_tensor, tokenizer_state = self.tokenizer.context_input_transform(context)
             fut_rollouts = min(remaining, max_accelerated_rollout_steps)
-            with torch.no_grad():
-                prediction, _ = self._forward_model_tokenized(input_token=tokenized_tensor, rollouts=fut_rollouts)
-            prediction = prediction[:, :, -fut_rollouts:, :].to(tokenized_tensor)  # predicted token
-            # [bs, num_quantiles, num_predicted_token, output_patch_size]
-            prediction = self.tokenizer.output_transform(prediction, tokenizer_state)
-            prediction = prediction.flatten(start_dim=2)
+            prediction, fut_rollouts = self._forecast_single_step(context, max_context, min_context, fut_rollouts)
 
             predictions.append(prediction)
             remaining -= fut_rollouts
@@ -134,6 +124,33 @@ def _forecast_tensor(
 
         return torch.cat(predictions, dim=-1)[..., :prediction_length].to(dtype=torch.float32)
 
+    def _forecast_single_step(
+        self,
+        context: torch.Tensor,
+        max_context: int,
+        min_context: int,
+        new_patch_count: int = 1,
+    ) -> tuple[torch.Tensor, int]:
+        if context.shape[-1] > max_context:
+            context = context[..., -max_context:]
+        if context.shape[-1] < min_context:
+            pad = torch.full(
+                (context.shape[0], min_context - context.shape[-1]),
+                fill_value=torch.nan,
+                device=context.device,
+                dtype=context.dtype,
+            )
+            context = torch.concat((pad, context), dim=1)
+
+        tokenized_tensor, tokenizer_state = self.tokenizer.context_input_transform(context)
+        prediction, _ = self._forward_model_tokenized(input_token=tokenized_tensor, rollouts=new_patch_count)
+        prediction = prediction[:, :, -new_patch_count:, :].to(tokenized_tensor)  # predicted token
+        # Shape: [bs, num_quantiles, num_predicted_token, output_patch_size]
+        prediction = self.tokenizer.output_transform(prediction, tokenizer_state)
+        prediction = prediction.flatten(start_dim=2)
+
+        return prediction, new_patch_count
+
     def _forward_model_tokenized(
         self,
         input_token: torch.Tensor,
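
The refactor above moves the per-step trimming, padding, tokenization and prediction into _forecast_single_step, leaving _forecast_tensor with a plain chunked rollout loop. A stand-alone sketch of that control flow with a stubbed one-step function; names, the patch size, and the feedback step are illustrative, not the library's API:

import torch

PATCH = 8  # illustrative output patch size


def forecast_single_step_stub(context, max_context, min_context, new_patch_count):
    # Stand-in for the real helper: trim long contexts, left-pad short ones
    # with NaN, then "predict" new_patch_count patches of PATCH values each.
    if context.shape[-1] > max_context:
        context = context[..., -max_context:]
    if context.shape[-1] < min_context:
        pad = torch.full((context.shape[0], min_context - context.shape[-1]), torch.nan,
                         dtype=context.dtype)
        context = torch.cat((pad, context), dim=1)
    return torch.zeros(context.shape[0], new_patch_count * PATCH), new_patch_count


def rollout(context, prediction_length, max_rollout_steps=4, max_context=512, min_context=64):
    predictions = []
    remaining = -(-prediction_length // PATCH)  # patches still to produce
    while remaining > 0:
        steps = min(remaining, max_rollout_steps)
        chunk, steps = forecast_single_step_stub(context, max_context, min_context, steps)
        predictions.append(chunk)
        remaining -= steps
        # Feed the predicted values back so the next chunk conditions on them
        # (illustrative; the exact feedback in the real code is outside the shown hunks).
        context = torch.cat((context, chunk), dim=-1)
    # Concatenate the chunks and cut to the exact horizon, as in _forecast_tensor.
    return torch.cat(predictions, dim=-1)[..., :prediction_length]


print(rollout(torch.randn(2, 100), prediction_length=50).shape)  # torch.Size([2, 50])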
@@ -165,21 +182,7 @@ def _forward_model_tokenized(
 
         input_token = torch.nan_to_num(input_token, nan=self.config.nan_mask_value)
 
-        hidden_states = self.input_patch_embedding(torch.cat((input_token, input_mask), dim=2))
-
-        for block in self.blocks:
-            hidden_states = block(hidden_states)
-
-        hidden_states = self.out_norm(hidden_states)
-
-        quantile_preds = self.output_patch_embedding(hidden_states)
-        quantile_preds = torch.unflatten(
-            quantile_preds, -1, (len(self.config.quantiles), self.config.output_patch_size)
-        )
-        quantile_preds = torch.transpose(quantile_preds, 1, 2)  # switch quantile and num_token_dimension
-        # quantile_preds: [batch_size, num_quantiles, num_token, output_patch_size]
-
-        quantile_preds = self._forward_model(torch.cat((input_token, input_mask), dim=2))
+        quantile_preds, hidden_states = self._forward_model(torch.cat((input_token, input_mask), dim=2))
 
         quantile_preds = torch.unflatten(
             quantile_preds, -1, (len(self.config.quantiles), self.config.output_patch_size)

@@ -196,7 +199,7 @@ def _forward_model(self, input: torch.Tensor):
 
         hidden_states = self.out_norm(hidden_states)
 
-        return self.output_patch_embedding(hidden_states)
+        return self.output_patch_embedding(hidden_states), hidden_states
 
     def _interpolate_quantiles(self, predictions: torch.Tensor, quantile_levels: list[float]):
         training_quantile_levels = self.config.quantiles
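
Net effect of the last two hunks: _forward_model now returns both the quantile head output and the hidden states it came from, and _forward_model_tokenized keeps only the reshaping of the quantile predictions. A minimal sketch of that reshaping with illustrative sizes:

import torch

# Illustrative sizes, not the model's real configuration.
batch, num_token, num_quantiles, output_patch_size = 2, 12, 9, 32

# What _forward_model now returns: the quantile head output plus the hidden
# states it was computed from (the second return value is new in this commit).
quantile_preds = torch.randn(batch, num_token, num_quantiles * output_patch_size)
hidden_states = torch.randn(batch, num_token, 512)

# The caller's post-processing, as in the context lines of the hunk above:
quantile_preds = torch.unflatten(quantile_preds, -1, (num_quantiles, output_patch_size))
quantile_preds = torch.transpose(quantile_preds, 1, 2)  # swap quantile and token dims

print(quantile_preds.shape)  # torch.Size([2, 9, 12, 32]) = [bs, num_quantiles, num_token, output_patch_size]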

0 commit comments
