Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions verl/workers/actor/megatron_actor.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,13 @@ def make_minibatch_iterator(self, data: DataProto) -> Iterable[DataProto]:
]
if self.config.use_kl_loss:
select_keys.append("ref_log_prob")
if self.config.tis_imp_ratio_cap > 0:
assert "rollout_log_probs" in data.batch.keys(), (
"Truncated Importance Sampling (TIS) requires to configure "
"`actor_rollout_ref.rollout.calculate_log_probs=True` "
"and is not currently supported in Server mode (agent loop)."
)
Comment on lines +292 to +296
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

Calling data.batch.keys() can raise an AttributeError when data.batch is None, since the DataProto class definition allows batch to be None. To prevent a potential crash, check that data.batch is not None before accessing its keys. Using the in operator directly on the TensorDict is also more idiomatic than calling .keys().

Suggested change
assert "rollout_log_probs" in data.batch.keys(), (
"Truncated Importance Sampling (TIS) requires to configure "
"`actor_rollout_ref.rollout.calculate_log_probs=True` "
"and is not currently supported in Server mode (agent loop)."
)
assert data.batch is not None and "rollout_log_probs" in data.batch, (
"Truncated Importance Sampling (TIS) requires to configure "
"`actor_rollout_ref.rollout.calculate_log_probs=True` "
"and is not currently supported in Server mode (agent loop)."
)

select_keys.append("rollout_log_probs")
self.has_multi_modal_inputs = "multi_modal_inputs" in data.non_tensor_batch.keys()
if self.has_multi_modal_inputs:
data = data.select(select_keys, ["multi_modal_inputs"])
Expand All @@ -309,6 +316,7 @@ def compute_ppo_loss(self, model_output, data):
response_mask = data["response_mask"].to(bool)
# compute policy loss
old_log_prob = data["old_log_probs"]
rollout_log_probs = data["rollout_log_probs"] if self.config.tis_imp_ratio_cap > 0 else None
advantages = data["advantages"]

loss_agg_mode = self.config.loss_agg_mode
Expand All @@ -323,6 +331,7 @@ def compute_ppo_loss(self, model_output, data):
response_mask=response_mask,
loss_agg_mode=loss_agg_mode,
config=self.config,
rollout_log_probs=rollout_log_probs,
)

metrics.update(
Expand Down
Loading