huggingface · qgallouedec · Oct 16, 2025 · Oct 13, 2025 · Oct 13, 2025 · Oct 13, 2025
diff --git a/docs/source/rewards.md b/docs/source/rewards.md
@@ -2,14 +2,14 @@
 
 This module contains some useful reward functions, primarily intended for use with the [`GRPOTrainer`] and [`RLOOTrainer`].
 
-## Format rewards
+## accuracy_reward
 
-### think_format_reward
+[[autodoc]] rewards.accuracy_reward
 
-[[autodoc]] rewards.think_format_reward
+## think_format_reward
 
-## Other rewards
+[[autodoc]] rewards.think_format_reward
 
-### get_soft_overlong_punishment
+## get_soft_overlong_punishment
 
 [[autodoc]] rewards.get_soft_overlong_punishment
diff --git a/examples/scripts/grpo_vlm.py b/examples/scripts/grpo_vlm.py
@@ -70,8 +70,6 @@
 
 import torch
 from datasets import load_dataset
-from latex2sympy2_extended import NormalizationConfig
-from math_verify import LatexExtractionConfig, parse, verify
 
 from trl import (
     GRPOConfig,
@@ -83,7 +81,7 @@
     get_peft_config,
     get_quantization_config,
 )
-from trl.rewards import think_format_reward
+from trl.rewards import accuracy_reward, think_format_reward
 
 
 # Enable logging in a Hugging Face Space
@@ -149,54 +147,6 @@ def convert_to_rgb(example):
     train_dataset = dataset["train"]
     eval_dataset = dataset["test"] if training_args.eval_strategy != "no" else None
 
-    ################
-    # Reward Function for Training
-    ################
-    def accuracy_reward(completions, solution: list[str], **kwargs):
-        """Reward function that checks if the completion matches the ground truth.
-        - If both gold and prediction are parseable → use math verification.
-        - If not parseable → compare as normalized text.
-        """
-        rewards = []
-        contents = [completion[0]["content"] for completion in completions]
-        for content, sol in zip(contents, solution):
-            try:
-                gold_parsed = parse(sol, extraction_mode="first_match")
-            except Exception:
-                gold_parsed = []
-
-            if len(gold_parsed) != 0:
-                # Try parsing predicted answer too
-                try:
-                    answer_parsed = parse(
-                        content,
-                        extraction_config=[
-                            LatexExtractionConfig(
-                                normalization_config=NormalizationConfig(
-                                    nits=False,
-                                    malformed_operators=False,
-                                    basic_latex=True,
-                                    boxed="all",
-                                    units=True,
-                                ),
-                                boxed_match_priority=0,
-                                try_extract_without_anchor=False,
-                            )
-                        ],
-                        extraction_mode="first_match",
-                    )
-                    reward = float(verify(gold_parsed, answer_parsed))
-                except Exception as e:
-                    print(f"verify failed: {e}, answer: {content}, gold: {sol}")
-                    reward = None
-            else:
-                # fallback to text match
-                reward = float(content.strip().lower() == sol.strip().lower())
-
-            rewards.append(reward)
-
-        return rewards
-
     ################
     # Training
     ################

diff --git a/examples/scripts/gspo.py b/examples/scripts/gspo.py
@@ -57,8 +57,6 @@
 
 import torch
 from datasets import load_dataset
-from latex2sympy2_extended import NormalizationConfig
-from math_verify import LatexExtractionConfig, parse, verify
 
 from trl import (
     GRPOConfig,
@@ -70,7 +68,7 @@
     get_peft_config,
     get_quantization_config,
 )
-from trl.rewards import think_format_reward
+from trl.rewards import accuracy_reward, think_format_reward
 
 
 # Enable logging in a Hugging Face Space
@@ -120,54 +118,6 @@ def make_conversation(example):
     train_dataset = train_dataset.remove_columns(["messages", "problem"])
     eval_dataset = eval_dataset.remove_columns(["messages", "problem"])
 
-    ################
-    # Reward Function for Training
-    ################
-    def accuracy_reward(completions, solution: list[str], **kwargs):
-        """Reward function that checks if the completion matches the ground truth.
-        - If both gold and prediction are parseable → use math verification.
-        - If not parseable → compare as normalized text.
-        """
-        rewards = []
-        contents = [completion[0]["content"] for completion in completions]
-        for content, sol in zip(contents, solution):
-            try:
-                gold_parsed = parse(sol, extraction_mode="first_match")
-            except Exception:
-                gold_parsed = []
-
-            if len(gold_parsed) != 0:
-                # Try parsing predicted answer too
-                try:
-                    answer_parsed = parse(
-                        content,
-                        extraction_config=[
-                            LatexExtractionConfig(
-                                normalization_config=NormalizationConfig(
-                                    nits=False,
-                                    malformed_operators=False,
-                                    basic_latex=True,
-                                    boxed="all",
-                                    units=True,
-                                ),
-                                boxed_match_priority=0,
-                                try_extract_without_anchor=False,
-                            )
-                        ],
-                        extraction_mode="first_match",
-                    )
-                    reward = float(verify(gold_parsed, answer_parsed))
-                except Exception as e:
-                    print(f"verify failed: {e}, answer: {content}, gold: {sol}")
-                    reward = None
-            else:
-                # fallback to text match
-                reward = float(content.strip().lower() == sol.strip().lower())
-
-            rewards.append(reward)
-
-        return rewards
-
     ################
     # Training
     ################

diff --git a/examples/scripts/gspo_vlm.py b/examples/scripts/gspo_vlm.py
@@ -57,8 +57,6 @@
 
 import torch
 from datasets import load_dataset
-from latex2sympy2_extended import NormalizationConfig
-from math_verify import LatexExtractionConfig, parse, verify
 
 from trl import (
     GRPOConfig,
@@ -70,7 +68,7 @@
     get_peft_config,
     get_quantization_config,
 )
-from trl.rewards import think_format_reward
+from trl.rewards import accuracy_reward, think_format_reward
 
 
 # Enable logging in a Hugging Face Space
@@ -136,54 +134,6 @@ def convert_to_rgb(example):
     train_dataset = dataset["train"]
     eval_dataset = dataset["test"] if training_args.eval_strategy != "no" else None
 
-    ################
-    # Reward Function for Training
-    ################
-    def accuracy_reward(completions, solution: list[str], **kwargs):
-        """Reward function that checks if the completion matches the ground truth.
-        - If both gold and prediction are parseable → use math verification.
-        - If not parseable → compare as normalized text.
-        """
-        rewards = []
-        contents = [completion[0]["content"] for completion in completions]
-        for content, sol in zip(contents, solution):
-            try:
-                gold_parsed = parse(sol, extraction_mode="first_match")
-            except Exception:
-                gold_parsed = []
-
-            if len(gold_parsed) != 0:
-                # Try parsing predicted answer too
-                try:
-                    answer_parsed = parse(
-                        content,
-                        extraction_config=[
-                            LatexExtractionConfig(
-                                normalization_config=NormalizationConfig(
-                                    nits=False,
-                                    malformed_operators=False,
-                                    basic_latex=True,
-                                    boxed="all",
-                                    units=True,
-                                ),
-                                boxed_match_priority=0,
-                                try_extract_without_anchor=False,
-                            )
-                        ],
-                        extraction_mode="first_match",
-                    )
-                    reward = float(verify(gold_parsed, answer_parsed))
-                except Exception as e:
-                    print(f"verify failed: {e}, answer: {content}, gold: {sol}")
-                    reward = None
-            else:
-                # fallback to text match
-                reward = float(content.strip().lower() == sol.strip().lower())
-
-            rewards.append(reward)
-
-        return rewards
-
     ################
     # Training
     ################

diff --git a/examples/scripts/online_dpo_vlm.py b/examples/scripts/online_dpo_vlm.py
@@ -87,8 +87,6 @@
 import torch
 import transformers
 from datasets import load_dataset
-from latex2sympy2_extended import NormalizationConfig
-from math_verify import LatexExtractionConfig, parse, verify
 from transformers import AutoConfig, AutoProcessor, GenerationConfig
 
 from trl import (
@@ -102,7 +100,7 @@
     get_peft_config,
     get_quantization_config,
 )
-from trl.rewards import think_format_reward
+from trl.rewards import accuracy_reward, think_format_reward
 
 
 # Enable logging in a Hugging Face Space
@@ -192,54 +190,6 @@ def convert_to_rgb(example):
     train_dataset = dataset["train"]
     eval_dataset = dataset["test"] if training_args.eval_strategy != "no" else None
 
-    ################
-    # Reward Function for Training (same as GRPO VLM)
-    ################
-    def accuracy_reward(completions, solution: list[str], **kwargs):
-        """Reward function that checks if the completion matches the ground truth.
-        - If both gold and prediction are parseable → use math verification.
-        - If not parseable → compare as normalized text.
-        """
-        rewards = []
-        contents = [completion[0]["content"] for completion in completions]
-        for content, sol in zip(contents, solution):
-            try:
-                gold_parsed = parse(sol, extraction_mode="first_match")
-            except Exception:
-                gold_parsed = []
-
-            if len(gold_parsed) != 0:
-                # Try parsing predicted answer too
-                try:
-                    answer_parsed = parse(
-                        content,
-                        extraction_config=[
-                            LatexExtractionConfig(
-                                normalization_config=NormalizationConfig(
-                                    nits=False,
-                                    malformed_operators=False,
-                                    basic_latex=True,
-                                    boxed="all",
-                                    units=True,
-                                ),
-                                boxed_match_priority=0,
-                                try_extract_without_anchor=False,
-                            )
-                        ],
-                        extraction_mode="first_match",
-                    )
-                    reward = float(verify(gold_parsed, answer_parsed))
-                except Exception as e:
-                    print(f"verify failed: {e}, answer: {content}, gold: {sol}")
-                    reward = None
-            else:
-                # fallback to text match
-                reward = float(content.strip().lower() == sol.strip().lower())
-
-            rewards.append(reward)
-
-        return rewards
-
     ################
     # Training
     ################

diff --git a/examples/scripts/rloo.py b/examples/scripts/rloo.py
@@ -33,12 +33,10 @@
 
 import torch
 from datasets import load_dataset
-from latex2sympy2_extended import NormalizationConfig
-from math_verify import LatexExtractionConfig, parse, verify
 from peft import LoraConfig
 
 from trl import RLOOConfig, RLOOTrainer
-from trl.rewards import think_format_reward
+from trl.rewards import accuracy_reward, think_format_reward
 
 
 # Enable logging in a Hugging Face Space
@@ -67,52 +65,6 @@ def make_conversation(example):
     train_dataset = train_dataset.map(make_conversation, remove_columns=["messages", "problem"])
     eval_dataset = eval_dataset.map(make_conversation, remove_columns=["messages", "problem"])
 
-    # Reward function for training
-    def accuracy_reward(completions, solution: list[str], **kwargs):
-        """Reward function that checks if the completion matches the ground truth.
-        - If both gold and prediction are parseable → use math verification.
-        - If not parseable → compare as normalized text.
-        """
-        rewards = []
-        contents = [completion[0]["content"] for completion in completions]
-        for content, sol in zip(contents, solution):
-            try:
-                gold_parsed = parse(sol, extraction_mode="first_match")
-            except Exception:
-                gold_parsed = []
-
-            if len(gold_parsed) != 0:
-                # Try parsing predicted answer too
-                try:
-                    answer_parsed = parse(
-                        content,
-                        extraction_config=[
-                            LatexExtractionConfig(
-                                normalization_config=NormalizationConfig(
-                                    nits=False,
-                                    malformed_operators=False,
-                                    basic_latex=True,
-                                    boxed="all",
-                                    units=True,
-                                ),
-                                boxed_match_priority=0,
-                                try_extract_without_anchor=False,
-                            )
-                        ],
-                        extraction_mode="first_match",
-                    )
-                    reward = float(verify(gold_parsed, answer_parsed))
-                except Exception as e:
-                    print(f"verify failed: {e}, answer: {content}, gold: {sol}")
-                    reward = None
-            else:
-                # fallback to text match
-                reward = float(content.strip().lower() == sol.strip().lower())
-
-            rewards.append(reward)
-
-        return rewards
-
     # Training
     training_args = RLOOConfig(
         output_dir="Qwen3-0.6B-RLOO",