Commit 1aa7aa1

no unnecessary logits upcast. fix naming
Signed-off-by: datta0 <[email protected]>
1 parent 478ec60 commit 1aa7aa1


2 files changed: +5 -6 lines changed


unsloth/models/rl.py

Lines changed: 2 additions & 2 deletions
@@ -672,10 +672,10 @@ def patch_functions(RLTrainer, trainer_file, RLTrainer_name, all_imports, import
     if trl_version >= "0.18":
         # Replace LLM init with already existing vLLM engine for colocate mode
         vllm_llm_init_pattern = r"self\.llm\s*=\s*LLM\([^)]*\)*\)"
-        vllm_llm_repalcement = "self.llm = model.vllm_engine\n"
+        vllm_llm_replacement = "self.llm = model.vllm_engine\n"
         new_vllm_part = re.sub(
             vllm_llm_init_pattern,
-            vllm_llm_repalcement,
+            vllm_llm_replacement,
             new_vllm_part,
             flags=re.DOTALL # Ensure . matches newlines [[5]]
         )
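
For reference, a minimal runnable sketch of the substitution whose name this hunk fixes: the pattern matches the trainer's `self.llm = LLM(...)` construction and `re.sub` swaps it for the vLLM engine already attached to the model. The `new_vllm_part` string below is a hypothetical stand-in for the trl trainer source being patched, not the actual file contents.

import re

# Pattern and (correctly named) replacement, as in the patched rl.py.
vllm_llm_init_pattern = r"self\.llm\s*=\s*LLM\([^)]*\)*\)"
vllm_llm_replacement = "self.llm = model.vllm_engine\n"

# Hypothetical stand-in for the trainer source text being rewritten.
new_vllm_part = '''
if self.vllm_mode == "colocate":
    self.llm = LLM(
        model=model.name_or_path,
        gpu_memory_utilization=0.9,
    )
'''

patched = re.sub(
    vllm_llm_init_pattern,
    vllm_llm_replacement,
    new_vllm_part,
    flags=re.DOTALL,  # as in the patch, so the multi-line LLM(...) call is handled
)
print(patched)  # the LLM(...) construction becomes "self.llm = model.vllm_engine"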

unsloth/models/rl_replacements.py

Lines changed: 3 additions & 4 deletions
@@ -215,18 +215,17 @@ def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep,
     if not hasattr(self, '_autocast_dtype'):
         self._autocast_dtype = torch.float16 if os.environ.get('ACCELERATE_MIXED_PRECISION', 'fp16') == 'fp16' else torch.bfloat16
         if os.environ.get('UNSLOTH_FORCE_FLOAT32', '0') == '1': self._autocast_dtype = torch.float16
-    os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "0"
+    # os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "0"
     with torch.amp.autocast(device_type = 'cuda', dtype = self._autocast_dtype):
         # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded
         logits = model(input_ids=input_ids, attention_mask=attention_mask, logits_to_keep=logits_to_keep + 1).logits
         logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred
-        logits = logits.to(torch.float32)
+        # logits = logits.to(torch.float32)
         input_ids = input_ids[:, -logits_to_keep:]
         # For transformers<=4.48, logits_to_keep argument isn't supported, so here we drop logits ourselves.
         # See https://github.com/huggingface/trl/issues/2770
         logits = logits[:, -logits_to_keep:]
-        #return logits
-        return selective_log_softmax(logits, input_ids) # compute logprobs for the input tokens
+        return logits
     pass
 pass

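Since the patched function now returns the sliced logits rather than per-token log-probabilities, the float32 upcast and the log-softmax are left to the caller. As a reference point, here is a from-scratch sketch of what the removed `selective_log_softmax(logits, input_ids)` line computed; the real helper lives in trl and may differ in details.

import torch

def per_token_logps(logits: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
    # logits: (B, L, V) scores per position; input_ids: (B, L) observed token ids.
    logps = torch.log_softmax(logits.float(), dim=-1)  # normalise over the vocabulary
    # Pick out the log-probability assigned to each observed token.
    return torch.gather(logps, -1, input_ids.unsqueeze(-1)).squeeze(-1)

# After this commit, _get_per_token_logps hands back logits of shape (B, logits_to_keep, V)
# in the autocast dtype (fp16/bf16); a computation like the one above happens downstream.
logits = torch.randn(2, 5, 32, dtype=torch.bfloat16)
ids = torch.randint(0, 32, (2, 5))
print(per_token_logps(logits, ids).shape)  # torch.Size([2, 5])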