@@ -662,126 +662,27 @@ def shutdown(self):
 
     def visualize_trajectory_last_step(self, tensor_batch, sample_idx=0, max_samples=1):
         """
-        Visualize last steps from a workflow rollout:
-        - detokenize prompts/responses
-        - show token usage mask
-        - show reward tokens (placed at the last response token)
-        - print Correct/Incorrect using `is_correct` from non_tensors
+        Visualize last steps from a workflow rollout using the shared visualization utility.
         """
-        from rllm.misc import colorful_print
+        from rllm.utils.visualization import visualize_trajectories
 
         # Select only last steps if stepwise-advantage is enabled
         if "is_last_step" in tensor_batch.non_tensor_batch:
             is_last = tensor_batch.non_tensor_batch["is_last_step"]
             if is_last is not None and len(is_last) == len(tensor_batch):
                 tensor_batch = tensor_batch[is_last]
 
-        prompts = tensor_batch.batch["prompts"]
-        responses = tensor_batch.batch["responses"]
-        # Full attention mask (covers prompt + response); split it into prompt and response parts
-        full_attn_mask = tensor_batch.batch["attention_mask"]
-        prompt_len = prompts.shape[1]
-        resp_len = responses.shape[1]
-        prompt_attn_mask = full_attn_mask[:, :prompt_len]
-        response_attn_mask = full_attn_mask[:, -resp_len:]
-
-        # Loss mask over the response tokens only
-        response_loss_mask = tensor_batch.batch.get("response_mask")
-
-        # Rewards aligned to response tokens
-        token_level_scores = tensor_batch.batch.get("step_rewards" if self.config.rllm.stepwise_advantage.enable and self.config.rllm.stepwise_advantage.mode == "per_step" else "traj_rewards")
-
-        # Optional meta to print outcome
-        is_correct = tensor_batch.non_tensor_batch.get("is_correct", None)
-        term_reasons = tensor_batch.non_tensor_batch.get("termination_reasons", None)
-        episode_ids = tensor_batch.non_tensor_batch.get("episode_ids", None)
-        trajectory_ids = tensor_batch.non_tensor_batch.get("trajectory_ids", None)
-
-        bsz = prompts.shape[0]
-        end_idx = min(sample_idx + max_samples, bsz)
-
-        for i in range(sample_idx, end_idx):
-            colorful_print("\n" + "=" * 60, fg="cyan", bold=True)
-            # Header with ids
-            if episode_ids is not None or trajectory_ids is not None:
-                colorful_print(f"Episode: {episode_ids[i] if episode_ids is not None else '?'} | Traj: {trajectory_ids[i] if trajectory_ids is not None else '?'}", fg="cyan", bold=True)
-
-            # Outcome line
-            if is_correct is not None:
-                ok = bool(is_correct[i])
-                colorful_print(f"Outcome: {'✓ Correct' if ok else '✗ Incorrect'}", fg=("green" if ok else "red"), bold=True)
-
-            if term_reasons is not None:
-                colorful_print(f"Termination: {term_reasons[i]}", fg="yellow")
-
-            # Legend before the example
-            legend = " ".join(
-                [
-                    "\x1b[37mwhite=masked\x1b[0m",
-                    "\x1b[34mblue=unmasked\x1b[0m",
-                    "\x1b[42m green bg=reward>0 \x1b[0m",
-                    "\x1b[41m red bg=reward<=0 \x1b[0m",
-                ]
-            )
-            print(f"[{legend}]")
-
-            # Detokenize prompt
-            prompt_tokens = prompts[i]
-            prompt_valid_mask = prompt_attn_mask[i].bool()
-            # Build one-line colored prompt (prompt is always masked-from-loss => white)
-            prompt_parts = []
-            for tok_id, is_valid in zip(prompt_tokens.tolist(), prompt_valid_mask.tolist(), strict=False):
-                if not is_valid:
-                    continue
-                tok = self.tokenizer.decode([tok_id]).replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t")
-                prompt_parts.append(f"\x1b[37m{tok}\x1b[0m")  # white
-            print("".join(prompt_parts))
-
-            # Separator line between prompt and response for readability
-            print("----------------")
-
-            # Detokenize response with token-level highlighting
-            resp_tokens = responses[i]
-            resp_valid_mask = response_attn_mask[i].bool()
-            loss_mask = response_loss_mask[i] if response_loss_mask is not None else resp_valid_mask
-            rewards = token_level_scores[i] if token_level_scores is not None else None
-
-            # Pre-compute reward positions (typically only the last valid resp token has nonzero reward)
-            reward_idx = None
-            reward_value = 0.0
-            if rewards is not None:
-                # consider only valid response positions
-                for j, is_valid in enumerate(resp_valid_mask.tolist()):
-                    if not is_valid:
-                        continue
-                    val = float(rewards[j].item()) if hasattr(rewards[j], "item") else float(rewards[j])
-                    if abs(val) > 1e-9:
-                        reward_idx = j
-                        reward_value = val
-
-            # Fallback: if no nonzero reward found, use the last valid response token
-            if reward_idx is None:
-                valid_indices = [idx for idx, v in enumerate(resp_valid_mask.tolist()) if v]
-                if valid_indices:
-                    reward_idx = valid_indices[-1]
-                    if rewards is not None:
-                        val = float(rewards[reward_idx].item()) if hasattr(rewards[reward_idx], "item") else float(rewards[reward_idx])
-                        reward_value = val
-
-            # Colors: white for masked-from-loss; blue for contributes-to-loss; overlay background red/green if reward token
-            response_parts = []
-            for j, tok_id in enumerate(resp_tokens.tolist()):
-                if not bool(resp_valid_mask[j].item() if hasattr(resp_valid_mask[j], "item") else resp_valid_mask[j]):
-                    continue
-                tok = self.tokenizer.decode([tok_id]).replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t")
-
-                contributes = bool(loss_mask[j].item()) if hasattr(loss_mask[j], "item") else bool(loss_mask[j])
-                fg = "\x1b[34m" if contributes else "\x1b[37m"  # blue if in loss, else white
-
-                bg = ""
-                if reward_idx is not None and j == reward_idx:
-                    bg = "\x1b[42m" if reward_value > 0 else "\x1b[41m"  # green background for positive, red for negative/zero
-
-                response_parts.append(f"{bg}{fg}{tok}\x1b[0m")
-
-            print("".join(response_parts))
+        if len(tensor_batch) == 0:
+            return
+
+        end_idx = min(sample_idx + max_samples, len(tensor_batch))
+        indices = list(range(sample_idx, end_idx))
+
+        visualize_trajectories(
+            batch=tensor_batch,
+            tokenizer=self.tokenizer,
+            sample_indices=indices,
+            mask_key="response_mask",
+            reward_key="step_rewards" if self.config.rllm.stepwise_advantage.enable and self.config.rllm.stepwise_advantage.mode == "per_step" else "traj_rewards",
+            show_workflow_metadata=True,
+        )
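
For context, the shared helper now owns the detokenization, masking, and reward-highlighting logic that the removed code implemented inline. A minimal sketch of the equivalent standalone call, using only the arguments visible in this diff (the batch and tokenizer objects are assumed to come from the surrounding rollout code, and the comments reflect my reading of the removed implementation rather than documented behavior):

    from rllm.utils.visualization import visualize_trajectories

    # Hypothetical usage: print the first two last-step trajectories of a rollout batch.
    visualize_trajectories(
        batch=tensor_batch,           # rollout batch with "prompts"/"responses"/"attention_mask" tensors
        tokenizer=tokenizer,          # tokenizer used to produce the rollout
        sample_indices=[0, 1],        # which rows of the batch to visualize
        mask_key="response_mask",     # loss mask over response tokens (blue vs. white in the old output)
        reward_key="traj_rewards",    # or "step_rewards" when per-step stepwise advantage is enabled
        show_workflow_metadata=True,  # episode/trajectory ids, outcome, termination reason
    )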