Skip to content

Commit 77a1aba

Browse files
author
chenzhenyang
committed
Merge remote-tracking branch 'origin/main' into refactor-trainengine-api
2 parents 4a77121 + 3880f4a commit 77a1aba

87 files changed

Lines changed: 3381 additions & 1474 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,10 @@ our project just as you enjoy real-world milk tea (cheers).
2424
[multi-turn agentic rollout](https://inclusionai.github.io/AReaL/customization/agent.html)
2525
workflows within a single file, and smooth integration with
2626
[other agentic tooling frameworks](https://inclusionai.github.io/AReaL/tutorial/agentic_rl.html).
27-
- 🚀 **Scalability**: Through algorithm-system co-design, AReaL delivers **stable** fully
27+
- 📈 **Scalability**: Through algorithm-system co-design, AReaL delivers **stable** fully
2828
asynchronous RL training with **industry-leading speed**. AReaL seamlessly adapts to
2929
diverse computational environments, scaling from a single node to 1,000+ GPUs.
30-
- 🔪 **Cutting-Edge Performance**: AReaL produces state-of-the-art
30+
- **Cutting-Edge Performance**: AReaL produces state-of-the-art
3131
[math](/blog/AReaL_v0_2.md), [coding](/blog/AReaL_v0_3.md), and
3232
[search agents](https://github.com/inclusionAI/ASearcher) with exceptional
3333
capabilities.

areal/api/cli_args.py

Lines changed: 59 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,24 +4,25 @@
44
from dataclasses import MISSING as dataclass_missing
55
from dataclasses import asdict, dataclass, field, fields
66
from pathlib import Path
7-
from typing import Any, TypeVar
7+
from typing import TYPE_CHECKING, Any, TypeVar
88

99
import uvloop
1010
import yaml
1111
from hydra import compose as hydra_compose
1212
from hydra import initialize as hydra_init
1313
from hydra.core.global_hydra import GlobalHydra
1414
from omegaconf import MISSING, DictConfig, OmegaConf
15-
from transformers import PreTrainedTokenizerFast
1615

17-
from areal.platforms import current_platform
1816
from areal.utils import logging, name_resolve, pkg_version
1917
from areal.utils.constants import (
2018
PROX_LOGP_METHOD_RECOMPUTE,
2119
PROX_LOGP_METHODS_ALL,
2220
)
2321
from areal.utils.pkg_version import is_version_less
2422

23+
if TYPE_CHECKING:
24+
from transformers import PreTrainedTokenizerFast
25+
2526
uvloop.install()
2627

2728
logger = logging.getLogger("CLI args")
@@ -174,7 +175,7 @@ def new(self, **kwargs):
174175
args.update(kwargs)
175176
return GenerationHyperparameters(**args)
176177

177-
def new_with_stop_and_pad_token_ids(self, tokenizer: PreTrainedTokenizerFast):
178+
def new_with_stop_and_pad_token_ids(self, tokenizer: "PreTrainedTokenizerFast"):
178179
"""Create a new generation hyperparameters with stop and pad token ids added."""
179180
new_stop_token_ids = self.stop_token_ids.copy()
180181
if tokenizer.pad_token_id not in new_stop_token_ids:
@@ -183,8 +184,29 @@ def new_with_stop_and_pad_token_ids(self, tokenizer: PreTrainedTokenizerFast):
183184
new_stop_token_ids.append(tokenizer.eos_token_id)
184185
return self.new(stop_token_ids=new_stop_token_ids)
185186

186-
def to_openai_args_dict(
187+
def to_openai_completions_args_dict(
188+
self, exclude_args: list[str] | None = None
189+
) -> dict[str, Any]:
190+
return self.to_openai_args_dict(
191+
exclude_args=exclude_args, api_format="completions"
192+
)
193+
194+
def to_openai_responses_args_dict(
187195
self, exclude_args: list[str] | None = None
196+
) -> dict[str, Any]:
197+
return self.to_openai_args_dict(
198+
exclude_args=exclude_args, api_format="responses"
199+
)
200+
201+
def to_openai_agents_model_settings_dict(
202+
self, exclude_args: list[str] | None = None
203+
) -> dict[str, Any]:
204+
return self.to_openai_args_dict(
205+
exclude_args=exclude_args, api_format="openai-agents"
206+
)
207+
208+
def to_openai_args_dict(
209+
self, exclude_args: list[str] | None = None, api_format: str = "completions"
188210
) -> dict[str, Any]:
189211
"""Convert the generation hyperparameters to a dictionary of arguments for OpenAI client."""
190212
final_exclude_args = set(exclude_args) if exclude_args is not None else set()
@@ -195,14 +217,22 @@ def to_openai_args_dict(
195217
"top_k", # Not supported by OpenAI
196218
"stop_token_ids", # Not supported by OpenAI
197219
"lora_name", # Not supported by OpenAI
220+
"max_tokens", # deprecated by "completions", not used in "responses", should be `max_new_tokens` in "openai-agents"
198221
}
199222
)
200223
# TODO: move the excluded args into extra body, so they can be passed through the client request
201224

202-
mapping = {
203-
"n_samples": "n",
204-
"max_new_tokens": "max_completion_tokens",
205-
}
225+
mapping = {"n_samples": "n"}
226+
if api_format == "completions":
227+
mapping["max_new_tokens"] = "max_completion_tokens"
228+
elif api_format == "responses":
229+
mapping["max_new_tokens"] = "max_output_tokens"
230+
elif api_format == "openai-agents":
231+
# NOTE: max_tokens in openai-agents means `max_new_tokens` in sglang/vllm. This is not a bug
232+
mapping["max_new_tokens"] = "max_tokens"
233+
else:
234+
raise ValueError(f"Unsupported API format: {api_format}")
235+
206236
res = {}
207237
for k, v in asdict(self).items():
208238
if k in final_exclude_args:
@@ -224,7 +254,10 @@ def to_openai_args_dict(
224254
f"Unsupported arg for openai format: `{k}` with value {current_value}"
225255
)
226256
continue
227-
res[mapping.get(k, k)] = v
257+
key = mapping.get(k, k)
258+
if key in res:
259+
logger.warning(f"Overriding key: {key} from {k} with value: {v}")
260+
res[key] = v
228261

229262
return res
230263

@@ -624,6 +657,20 @@ class PPOActorConfig(TrainEngineConfig):
624657
metadata={"help": "KL divergence estimator", "choices": ["k1", "k2", "k3"]},
625658
)
626659

660+
# SAPO (Soft Adaptive Policy Optimization) - https://arxiv.org/abs/2511.20347
661+
use_sapo_loss: bool = field(
662+
default=False,
663+
metadata={"help": "Use SAPO loss (mutually exclusive with PPO clipping)"},
664+
)
665+
sapo_tau_pos: float = field(
666+
default=1.0,
667+
metadata={"help": "SAPO temperature for positive advantages"},
668+
)
669+
sapo_tau_neg: float = field(
670+
default=1.05,
671+
metadata={"help": "SAPO temperature for negative advantages"},
672+
)
673+
627674
# Asynchronous RL
628675
recompute_logprob: bool = field(
629676
default=False,
@@ -956,6 +1003,8 @@ def build_args(
9561003
args["lora_target_modules"] = [
9571004
x.replace("-linear", "") for x in args["lora_target_modules"]
9581005
]
1006+
from areal.platforms import current_platform
1007+
9591008
args = dict(
9601009
# Model and tokenizer
9611010
tokenizer_path=sglang_config.model_path,

areal/api/reward_api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def _cleanup_executor(cls, executor_key):
107107
if cls._instance_counts[executor_key] <= 0:
108108
if executor_key in cls._executors:
109109
executor = cls._executors.pop(executor_key)
110-
executor.shutdown(wait=True)
110+
executor.shutdown(wait=False, cancel_futures=True)
111111
logger.debug(
112112
f"ProcessPoolExecutor with {executor_key} workers shut down"
113113
)

areal/core/async_task_runner.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,18 +234,29 @@ def initialize(self, logger=None):
234234
self.thread.start()
235235
self._loop_ready.wait()
236236

237-
def destroy(self):
237+
def destroy(self, timeout: float = 30.0):
238238
"""Shutdown the task runner and wait for thread cleanup.
239239
240240
This method signals the background thread to exit and waits for
241241
it to complete. All pending tasks will be cancelled.
242+
243+
Parameters
244+
----------
245+
timeout : float, optional
246+
Maximum time in seconds to wait for thread to exit.
247+
Default is 30.0 seconds.
242248
"""
243249
self.exiting.set()
244250
self.paused.clear()
245251

246252
self._signal_new_input()
247253
if self.thread is not None:
248-
self.thread.join()
254+
self.thread.join(timeout=timeout)
255+
if self.thread.is_alive():
256+
if self.logger:
257+
self.logger.warning(
258+
f"Background thread did not exit within {timeout}s timeout."
259+
)
249260

250261
def register_shutdown_hook(self, hook: Callable[[], Awaitable[None]]) -> None:
251262
"""Register an async cleanup function to be called during shutdown.

areal/engine/ppo/actor.py

Lines changed: 58 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
dynamic_sampling,
2828
ppo_actor_loss_fn,
2929
reward_overlong_penalty,
30+
sapo_loss_fn,
3031
)
3132
from areal.utils.perf_tracer import trace_perf
3233

@@ -353,6 +354,10 @@ def ppo_update(self, data: dict[str, Any]) -> None:
353354
importance_sampling_level=self.config.importance_sampling_level,
354355
current_version=current_version,
355356
prox_logp_method=self.config.prox_logp_method,
357+
use_sapo_loss=self.config.use_sapo_loss,
358+
sapo_tau_pos=self.config.sapo_tau_pos,
359+
sapo_tau_neg=self.config.sapo_tau_neg,
360+
use_decoupled_loss=self.config.use_decoupled_loss,
356361
),
357362
loss_weight_fn=lambda x: x["loss_mask"].count_nonzero(),
358363
)
@@ -405,6 +410,10 @@ def grpo_loss_fn(
405410
importance_sampling_level: str = "token",
406411
current_version: int | None = None,
407412
prox_logp_method: str = PROX_LOGP_METHOD_RECOMPUTE,
413+
use_sapo_loss: bool = False,
414+
sapo_tau_pos: float = 1.0,
415+
sapo_tau_neg: float = 1.05,
416+
use_decoupled_loss: bool = False,
408417
vocab_min_logits: torch.Tensor | None = None,
409418
vocab_max_logits: torch.Tensor | None = None,
410419
):
@@ -431,19 +440,37 @@ def grpo_loss_fn(
431440
if m2_threshold is not None:
432441
loss_mask = _apply_m2po_masking(old_logp, prox_logp, loss_mask, m2_threshold)
433442

434-
loss, stat = ppo_actor_loss_fn(
435-
logprobs=logprobs,
436-
old_logprobs=old_logp,
437-
advantages=advantages,
438-
eps_clip=eps_clip,
439-
eps_clip_higher=eps_clip_higher,
440-
loss_mask=loss_mask,
441-
c_clip=c_clip,
442-
proximal_logprobs=prox_logp,
443-
behav_imp_weight_cap=behav_imp_weight_cap,
444-
importance_sampling_level=importance_sampling_level,
445-
cu_seqlens=input_data.get("cu_seqlens"),
446-
)
443+
# Use SAPO or PPO loss
444+
if use_sapo_loss:
445+
if use_decoupled_loss:
446+
raise ValueError(
447+
"SAPO is not compatible with `use_decoupled_loss=True`. "
448+
"Please set `actor.use_decoupled_loss=false` in your configuration."
449+
)
450+
loss, stat = sapo_loss_fn(
451+
logprobs=logprobs,
452+
old_logprobs=old_logp,
453+
advantages=advantages,
454+
tau_pos=sapo_tau_pos,
455+
tau_neg=sapo_tau_neg,
456+
loss_mask=loss_mask,
457+
importance_sampling_level=importance_sampling_level,
458+
cu_seqlens=input_data.get("cu_seqlens"),
459+
)
460+
else:
461+
loss, stat = ppo_actor_loss_fn(
462+
logprobs=logprobs,
463+
old_logprobs=old_logp,
464+
advantages=advantages,
465+
eps_clip=eps_clip,
466+
eps_clip_higher=eps_clip_higher,
467+
loss_mask=loss_mask,
468+
c_clip=c_clip,
469+
proximal_logprobs=prox_logp,
470+
behav_imp_weight_cap=behav_imp_weight_cap,
471+
importance_sampling_level=importance_sampling_level,
472+
cu_seqlens=input_data.get("cu_seqlens"),
473+
)
447474

448475
# Log training statistics
449476
stats_tracker.denominator(
@@ -483,14 +510,24 @@ def grpo_loss_fn(
483510
denominator="n_tokens",
484511
)
485512

486-
clip_mask = stat["clip_mask"]
487-
clipped_new_logp = torch.where(clip_mask, logprobs.detach(), 0.0)
488-
clipped_old_logp = torch.where(clip_mask, old_logp, 0.0)
489-
stats_tracker.stat(
490-
clipped_new_logp=clipped_new_logp,
491-
clipped_old_logp=clipped_old_logp,
492-
denominator="clipped_tokens",
493-
)
513+
# Log SAPO-specific statistics
514+
if use_sapo_loss:
515+
stats_tracker.stat(
516+
sapo_soft_gate=stat["sapo_soft_gate"],
517+
sapo_scaled_gate_pos=stat["sapo_scaled_gate_pos"],
518+
sapo_scaled_gate_neg=stat["sapo_scaled_gate_neg"],
519+
denominator="n_valid_tokens",
520+
)
521+
else:
522+
# Log clipping statistics (PPO only)
523+
clip_mask = stat["clip_mask"]
524+
clipped_new_logp = torch.where(clip_mask, logprobs.detach(), 0.0)
525+
clipped_old_logp = torch.where(clip_mask, old_logp, 0.0)
526+
stats_tracker.stat(
527+
clipped_new_logp=clipped_new_logp,
528+
clipped_old_logp=clipped_old_logp,
529+
denominator="clipped_tokens",
530+
)
494531

495532
# Log proximal approximation metrics
496533
compute_logp_mask = stat.get("behave_mask", loss_mask)

0 commit comments

Comments
 (0)