diff --git a/pyproject.toml b/pyproject.toml
index 523794ee1..f9a33a861 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -37,10 +37,10 @@ triton = [
 ]
 
 huggingface = [
-    "unsloth_zoo>=2025.5.8",
+    "unsloth_zoo>=2025.5.10",
     "packaging",
     "tyro",
-    "transformers==4.51.3,!=4.47.0",
+    "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2",
     "datasets>=3.4.1",
     "sentencepiece>=0.2.0",
     "tqdm",
@@ -48,7 +48,7 @@ huggingface = [
     "wheel>=0.42.0",
     "numpy",
     "accelerate>=0.34.1",
-    "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,<=0.15.2",
+    "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0",
    "peft>=0.7.1,!=0.11.0",
     "protobuf<4.0.0",
     "huggingface_hub",
@@ -381,10 +381,10 @@ colab-ampere-torch220 = [
     "flash-attn>=2.6.3",
 ]
 colab-new = [
-    "unsloth_zoo>=2025.5.8",
+    "unsloth_zoo>=2025.5.9",
     "packaging",
     "tyro",
-    "transformers==4.51.3,!=4.47.0",
+    "transformers>=4.51.3,!=4.47.0,!=4.52.0,!=4.52.1,!=4.52.2",
     "datasets>=3.4.1",
     "sentencepiece>=0.2.0",
     "tqdm",
@@ -399,7 +399,7 @@ colab-new = [
 ]
 colab-no-deps = [
     "accelerate>=0.34.1",
-    "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0,<=0.15.2",
+    "trl>=0.7.9,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,!=0.15.0",
     "peft>=0.7.1",
     "xformers",
     "bitsandbytes>=0.45.5",
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 964e874c5..932542806 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "2025.5.7"
+__version__ = "2025.5.8"
 
 __all__ = [
     "SUPPORTS_BFLOAT16",
diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py
index b385dba2e..e5cb22643 100644
--- a/unsloth/models/rl.py
+++ b/unsloth/models/rl.py
@@ -395,7 +395,7 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"):
     if trainer_file in RL_METRICS_CHANGES:
         process_extra_args = RL_METRICS_CHANGES[trainer_file]
         for process_extra_arg in process_extra_args:
-            other_metrics_processor += process_extra_arg(call_args, extra_args)
+            other_metrics_processor += process_extra_arg(old_RLTrainer_source, old_RLConfig_source)
     pass
 
     # Add statistics as well!
@@ -481,6 +481,39 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"):
         extra_args += num_proc_check
     pass
 
+    # Check for loss_type = dr_grpo and scale_rewards for GRPO
+    if "loss_type" in call_args and "scale_rewards" in call_args:
+        check_dr_grpo = \
+            "if loss_type.lower() == 'dr_grpo':\n"\
+            "    loss_type = 'dr_grpo'\n"\
+            "elif loss_type.lower() == 'dapo':\n"\
+            "    loss_type = 'dapo'\n"\
+            "if loss_type.lower() == 'dr_grpo':\n"\
+            "    if scale_rewards == None:\n"\
+            "        scale_rewards = True\n"\
+            "    elif scale_rewards == True:\n"\
+            "        print('The Dr GRPO paper recommends setting `scale_rewards` to False! Will override. Set it to `None` to force False.')\n"\
+            "        scale_rewards = False\n"\
+            "elif loss_type.lower() == 'dapo':\n"\
+            "    print('The DAPO paper recommends `mask_truncated_completions = True`')\n"\
+            "    print('The DAPO paper recommends `epsilon_high = 0.28`')\n"\
+            "    mask_truncated_completions = True\n"\
+            "    epsilon_high = 0.28\n"\
+            "\n"
+        extra_args += check_dr_grpo
+    pass
+
+    # Check GRPO num_generations mismatch
+    if "per_device_train_batch_size" in call_args and "num_generations" in call_args:
+        check_num_generations = \
+            "if (per_device_train_batch_size // num_generations) * num_generations != per_device_train_batch_size:\n"\
+            "    print('Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.\\n"\
+            "We will change the batch size of ' + str(per_device_train_batch_size) + ' to the `num_generations` of ' + str(num_generations))\n"\
+            "    per_device_train_batch_size = num_generations\n"\
+            "\n"
+        extra_args += check_num_generations
+    pass
+
     # Edit config with anything extra
     if trainer_file in RL_CONFIG_CHANGES:
         process_extra_args = RL_CONFIG_CHANGES[trainer_file]
diff --git a/unsloth/models/rl_replacements.py b/unsloth/models/rl_replacements.py
index 2ff0e253e..171e75d19 100644
--- a/unsloth/models/rl_replacements.py
+++ b/unsloth/models/rl_replacements.py
@@ -363,13 +363,27 @@ def grpo_trainer_fix_batch_size(RLTrainer_source, RLConfig_source):
 def grpo_trainer_metrics(RLTrainer_source, RLConfig_source):
     if "reward_funcs" not in RLTrainer_source: return ""
 
+    # For new TRL we have /mean and /std
+    use_mean = "rewards/{reward_func_name}/mean" in RLTrainer_source
+    use_std = "rewards/{reward_func_name}/std" in RLTrainer_source
+    if not use_mean:
+        use_normal = "rewards/{reward_func_name}" in RLTrainer_source
+    else:
+        use_normal = False
+    pass
+
     log_metrics = \
         "if not isinstance(reward_funcs, list): _reward_funcs = [reward_funcs]\n"\
         "else: _reward_funcs = reward_funcs\n"\
         "for reward_func in _reward_funcs:\n"\
         "    try:\n"\
         "        reward_func_name = reward_func.__name__\n"\
-        "        other_metrics.append(f'rewards/{reward_func_name}')\n"\
+        f"        if {use_mean}:\n"\
+        "            other_metrics.append(f'rewards/{reward_func_name}/mean')\n"\
+        f"        if {use_std}:\n"\
+        "            other_metrics.append(f'rewards/{reward_func_name}/std')\n"\
+        f"        if {use_normal}:\n"\
+        "            other_metrics.append(f'rewards/{reward_func_name}')\n"\
         "    except: pass\n"
     return log_metrics
 pass
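
Note (not part of the diff): the check_dr_grpo and check_num_generations strings added in rl.py are source fragments that Unsloth splices into the patched GRPOConfig constructor. As a rough, self-contained sketch of what that injected code does once rendered, with the config arguments written as plain function parameters (the wrapper function below is hypothetical; inside the real patch these names are the constructor's local arguments):

# Hypothetical helper, only for illustration.
def _grpo_config_checks(loss_type, scale_rewards,
                        per_device_train_batch_size, num_generations,
                        mask_truncated_completions, epsilon_high):
    # check_dr_grpo: normalize the loss_type casing first.
    if loss_type.lower() == 'dr_grpo':
        loss_type = 'dr_grpo'
    elif loss_type.lower() == 'dapo':
        loss_type = 'dapo'
    # Then apply the Dr GRPO / DAPO recommendations
    # (comparisons kept exactly as in the injected string).
    if loss_type.lower() == 'dr_grpo':
        if scale_rewards == None:
            scale_rewards = True
        elif scale_rewards == True:
            print('The Dr GRPO paper recommends setting `scale_rewards` to False! '
                  'Will override. Set it to `None` to force False.')
            scale_rewards = False
    elif loss_type.lower() == 'dapo':
        print('The DAPO paper recommends `mask_truncated_completions = True`')
        print('The DAPO paper recommends `epsilon_high = 0.28`')
        mask_truncated_completions = True
        epsilon_high = 0.28

    # check_num_generations: force the batch size to be a multiple of num_generations.
    if (per_device_train_batch_size // num_generations) * num_generations != per_device_train_batch_size:
        print('Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.\n'
              'We will change the batch size of ' + str(per_device_train_batch_size) +
              ' to the `num_generations` of ' + str(num_generations))
        per_device_train_batch_size = num_generations

    return (loss_type, scale_rewards, per_device_train_batch_size,
            mask_truncated_completions, epsilon_high)

# Example: an explicit scale_rewards=True with Dr GRPO is overridden to False,
# and a batch size of 5 is changed to num_generations=4.
print(_grpo_config_checks('Dr_GRPO', True, 5, 4, False, None))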
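
Similarly (also not part of the diff), because grpo_trainer_metrics interpolates use_mean, use_std, and use_normal with f-strings at patch time, those booleans land in the generated trainer source as literals. For a TRL version whose GRPO trainer logs rewards/<name>/mean and rewards/<name>/std (so use_mean = use_std = True and use_normal = False), the log_metrics string expands to roughly the snippet below; match_format_exactly is a made-up reward function used only to make the sketch runnable:

# Made-up reward function so the expanded snippet can run stand-alone.
def match_format_exactly(completions, **kwargs):
    return [0.0 for _ in completions]

reward_funcs = match_format_exactly   # may be a single callable or a list
other_metrics = []                    # list the patched trainer appends metric names to

# Expansion of log_metrics with use_mean = True, use_std = True, use_normal = False.
if not isinstance(reward_funcs, list): _reward_funcs = [reward_funcs]
else: _reward_funcs = reward_funcs
for reward_func in _reward_funcs:
    try:
        reward_func_name = reward_func.__name__
        if True:
            other_metrics.append(f'rewards/{reward_func_name}/mean')
        if True:
            other_metrics.append(f'rewards/{reward_func_name}/std')
        if False:
            other_metrics.append(f'rewards/{reward_func_name}')
    except: pass

print(other_metrics)  # ['rewards/match_format_exactly/mean', 'rewards/match_format_exactly/std']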