simpletuner/helpers/models/chroma/pipeline.py (28 changes: 20 additions & 8 deletions)
@@ -841,19 +841,23 @@ def _get_t5_prompt_embeds(
             return_tensors="pt",
         )
         text_input_ids = text_inputs.input_ids
-        attention_mask = text_inputs.attention_mask.clone()
-
-        seq_lengths = attention_mask.sum(dim=1)
-        mask_indices = torch.arange(attention_mask.size(1)).unsqueeze(0).expand(batch_size, -1)
-        attention_mask = (mask_indices <= seq_lengths.unsqueeze(1)).bool()
+        tokenizer_mask = text_inputs.attention_mask
+        tokenizer_mask_device = tokenizer_mask.to(device)
 
+        # unlike Flux, Chroma uses the tokenizer's attention mask when generating the T5 embeddings
         prompt_embeds = self.text_encoder(
-            text_input_ids.to(device), output_hidden_states=False, attention_mask=attention_mask.to(device)
+            text_input_ids.to(device),
+            output_hidden_states=False,
+            attention_mask=tokenizer_mask_device,
         )[0]
 
         dtype = self.text_encoder.dtype
         prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
-        attention_mask = attention_mask.to(device=device)
+
+        # for the text tokens, Chroma requires that all except the first padding token are masked out
+        seq_lengths = tokenizer_mask_device.sum(dim=1)
+        mask_indices = torch.arange(tokenizer_mask_device.size(1), device=device).unsqueeze(0).expand(batch_size, -1)
+        attention_mask = (mask_indices <= seq_lengths.unsqueeze(1)).to(dtype=dtype, device=device)
 
         _, seq_len, _ = prompt_embeds.shape

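The `<=` comparison is what implements the comment in this hunk: `mask_indices < seq_lengths` would just reproduce the tokenizer mask, while `<=` additionally lets the first padding position through, and the result is cast to the text encoder's floating-point dtype rather than bool. A free-standing sketch of just that rule, with made-up sizes (`tokenizer_mask` and `chroma_mask` are illustrative names, not from the pipeline):

import torch

# Toy input: batch of 1, three real tokens followed by three pad tokens,
# standing in for the tokenizer's 0/1 attention mask from the hunk above.
tokenizer_mask = torch.tensor([[1, 1, 1, 0, 0, 0]])
batch_size, max_len = tokenizer_mask.shape

seq_lengths = tokenizer_mask.sum(dim=1)  # tensor([3])
mask_indices = torch.arange(max_len).unsqueeze(0).expand(batch_size, -1)
# `<=` keeps positions 0..3: the real tokens plus the first pad token.
chroma_mask = (mask_indices <= seq_lengths.unsqueeze(1)).to(dtype=torch.float32)

print(chroma_mask)  # tensor([[1., 1., 1., 1., 0., 0.]])

Because that cast produces a floating-point mask, anything concatenated onto it downstream should match its dtype, which is what the second hunk addresses.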
@@ -1154,7 +1158,15 @@ def _prepare_attention_mask(
             return attention_mask
 
         attention_mask = torch.cat(
-            [attention_mask, torch.ones(batch_size, sequence_length, device=attention_mask.device, dtype=torch.bool)],
+            [
+                attention_mask,
+                torch.ones(
+                    batch_size,
+                    sequence_length,
+                    device=attention_mask.device,
+                    dtype=attention_mask.dtype,
+                ),
+            ],
             dim=1,
         )

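This hunk stops hard-coding `torch.bool` for the ones appended to the mask and inherits the incoming mask's dtype instead; after the first hunk that mask is floating point, so the concatenated mask now stays in one dtype end to end. A minimal sketch of the same dtype-matching pattern, assuming a bfloat16 text mask like the one produced above (`text_mask`, `padded_mask`, and the sizes are illustrative, not pipeline names):

import torch

# Hypothetical shapes: a 4-position text mask, 6 further positions to append.
batch_size, text_len, image_len = 1, 4, 6
text_mask = torch.tensor([[1.0, 1.0, 1.0, 0.0]], dtype=torch.bfloat16)

padded_mask = torch.cat(
    [
        text_mask,
        # dtype=text_mask.dtype mirrors dtype=attention_mask.dtype in the diff:
        # the appended ones take whatever dtype the text mask arrived in,
        # instead of the hard-coded torch.bool the old line used.
        torch.ones(batch_size, image_len, device=text_mask.device, dtype=text_mask.dtype),
    ],
    dim=1,
)

assert padded_mask.dtype == torch.bfloat16
assert padded_mask.shape == (batch_size, text_len + image_len)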