unslothai · AshAnand34 · May 30, 2025
diff --git a/unsloth_zoo/temporary_patches/gemma.py b/unsloth_zoo/temporary_patches/gemma.py
@@ -262,10 +262,6 @@ def forward(
             logits_to_keep=logits_to_keep,
             **lm_kwargs,
         )
-        labels = None
-        # We NEVER ENTER if labels is not None: since we already accounted for it
-
-
         logits = outputs.logits
         loss = None
         if labels is not None:
@@ -288,7 +284,7 @@ def forward(
             flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
             flat_labels = shift_labels.view(-1).to(shift_logits.device)
             loss = loss_fct(flat_logits, flat_labels)
-        loss = outputs.loss
+        loss = getattr(outputs, "loss", loss)
 
         return Gemma3CausalLMOutputWithPast(
             loss=loss,
@@ -352,14 +348,7 @@ def forward(
             cache_position=cache_position,
             **lm_kwargs,
         )
-        labels = None
-        # We NEVER ENTER if labels is not None: since we already accounted for it
-
-        hidden_states = outputs[0]
-        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-
+        logits = outputs.logits
         loss = None
         if labels is not None:
             # Upcast to float if we need to compute the loss to avoid potential precision issues
@@ -381,7 +370,7 @@ def forward(
             flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
             flat_labels = shift_labels.view(-1).to(shift_logits.device)
             loss = loss_fct(flat_logits, flat_labels)
-        loss = outputs.loss
+        loss = getattr(outputs, "loss", loss)
 
         if not return_dict:
             output = (logits,) + outputs[1:]