Merged (changes from 2 commits)
unsloth/models/rl_replacements.py (28 changes: 19 additions & 9 deletions)

@@ -214,18 +214,28 @@ def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep,
     if not hasattr(self, '_autocast_dtype'):
         self._autocast_dtype = torch.float16 if os.environ.get('ACCELERATE_MIXED_PRECISION', 'fp16') == 'fp16' else torch.bfloat16
         if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': self._autocast_dtype = torch.float16
-    os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "0"
+    os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "1"
     with torch.amp.autocast(device_type = 'cuda', dtype = self._autocast_dtype):
         # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded
-        logits = model(input_ids=input_ids, attention_mask=attention_mask, logits_to_keep=logits_to_keep + 1).logits
-        logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred
-        logits = logits.to(torch.float32)
+        hidden_states = model(input_ids=input_ids, attention_mask=attention_mask, logits_to_keep=logits_to_keep + 1).logits
Owner:
NIT: These are logits. Can we rename them appropriately?

Author:
I actually return those as hidden states, so we save memory and can calculate logits on the fly over here:

https://github.com/pluesclues/unsloth-zoo/blob/8b5bfe233f819aac89876025d121d6af28713af6/unsloth_zoo/rl_replacements.py#L127-L129
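A minimal sketch of the memory trick being described: keep the (B, L, H) hidden states and project to the vocabulary lazily, in chunks, when log-probabilities are needed. Everything below is illustrative; chunked_logprobs, lm_head, and chunk_size are assumed names, not the unsloth-zoo API:

import torch

def chunked_logprobs(hidden_states, input_ids, lm_head, chunk_size = 1024):
    # Project to the vocabulary one chunk at a time, so the full
    # (B, L, V) logits tensor is never materialized all at once.
    logps = []
    for hs, ids in zip(hidden_states.split(chunk_size, dim = 1),
                       input_ids.split(chunk_size, dim = 1)):
        logits = lm_head(hs).float()  # (B, chunk, V), freed after this iteration
        chosen = logits.gather(-1, ids.unsqueeze(-1)).squeeze(-1)
        logps.append(chosen - torch.logsumexp(logits, dim = -1))
    return torch.cat(logps, dim = 1)  # per-token log-probabilities

Under these assumptions, peak memory for the logits drops from O(B * L * V) to O(B * chunk_size * V).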

+        #breakpoint()
+        #logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred
+        return hidden_states
Owner:
If we're returning here, we don't need the code below. Remove it, perhaps?

Author:
I want to leave it there for debugging purposes; Dan had the old code below because of that.

         input_ids = input_ids[:, -logits_to_keep:]
         # For transformers<=4.48, logits_to_keep argument isn't supported, so here we drop logits ourselves.
         # See https://github.com/huggingface/trl/issues/2770
         logits = logits[:, -logits_to_keep:]
+        logits = logits.to(torch.float32)
Owner:
NIT: Please remove this.

+        #return logits
-        return selective_log_softmax(logits, input_ids) # compute logprobs for the input tokens
+        logps = selective_log_softmax(logits, input_ids)
+
+        # row_indices, col_indices = torch.where(logps < -20)
+
+        # # Method 1: check whether any offending positions were found
+        # if len(row_indices) > 0 and len(col_indices) > 0:
+        #     breakpoint() # breakpoint triggered here
+        #     print("Found unusually low log-probs!")
+        return logps # compute logprobs for the input tokens
     pass
 pass

@@ -280,16 +290,16 @@ def compute_loss(self, model, inputs, return_outputs = False, num_items_in_batch
     # per_token_loss = -(per_token_loss - self.beta * per_token_kl)
     # loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
 
-    old_per_token_logps = inputs["old_per_token_logps"]
+    old_hidden_states = inputs["old_per_token_logps"]
     input_ids = input_ids[:, -logits_to_keep:]
+    #breakpoint()
     if per_token_logps is not None:
+        #
         loss, completion_length, mean_kl = grpo_compute_loss_slow(
-            ref_per_token_logps, old_per_token_logps, per_token_logps, input_ids, completion_mask, self.beta, advantages,
+            old_hidden_states, per_token_logps, ref_per_token_logps, input_ids, completion_mask, self.beta, advantages,
         )
     else:
         loss, completion_length, mean_kl = grpo_accumulated_loss(
-            self, _input_ids, logits_to_keep, completion_mask, advantages, old_per_token_logps,
+            self, _input_ids, logits_to_keep, completion_mask, advantages, old_hidden_states,
             n_chunks = self.args.unsloth_num_chunks
         )
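For reference, the commented-out lines at the top of this hunk spell out the per-token objective that grpo_compute_loss_slow and grpo_accumulated_loss compute. Below is a rough sketch of that masked GRPO loss, assuming all three inputs are already per-token log-probabilities (in this PR the "old" quantity actually arrives as hidden states and is projected to log-probs first); this is illustrative, not the unsloth implementation:

import torch

def grpo_loss_sketch(per_token_logps, old_per_token_logps, ref_per_token_logps,
                     advantages, completion_mask, beta):
    # Importance ratio between the current and old policy, per token.
    # (In single-iteration GRPO, old_per_token_logps can be the detached
    # current log-probs, so the ratio is 1 in the forward pass.)
    ratio = torch.exp(per_token_logps - old_per_token_logps)
    per_token_loss = ratio * advantages.unsqueeze(1)
    # k3 estimator of KL(current || ref), as in the TRL GRPO trainer.
    per_token_kl = (torch.exp(ref_per_token_logps - per_token_logps)
                    - (ref_per_token_logps - per_token_logps) - 1.0)
    per_token_loss = -(per_token_loss - beta * per_token_kl)
    # Mean over completion tokens per sequence, then over the batch.
    return ((per_token_loss * completion_mask).sum(dim = 1)
            / completion_mask.sum(dim = 1)).mean()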
