Skip to content

Commit 07b0efc

Browse files
committed
fix: handle list of metric dicts in reduce_metrics after DataProto.concat()
When multiple workers return metrics, DataProto.concat() aggregates them into a list of dicts (introduced in b3c6274). This caused reduce_metrics() to fail with: AttributeError: 'list' object has no attribute 'items'.

Changes:
- Update reduce_metrics() to accept both a dict and a list of dicts
- Merge the list of metric dicts before applying reduction operations
- Maintain backward compatibility with existing dict input
- Add comprehensive tests for the new list input handling

Fixes the error:
  File "verl/trainer/ppo/ray_trainer.py", line 1129, in fit
    critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
  AttributeError: 'list' object has no attribute 'items'
1 parent b3c6274 commit 07b0efc

File tree

2 files changed

+66
-2
lines changed

2 files changed

+66
-2
lines changed

tests/trainer/ppo/test_metric_utils_on_cpu.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,45 @@ def test_reduce_metrics_single_value(self):
6666

6767
self.assertEqual(result["single"], 5.0)
6868

69+
def test_reduce_metrics_list_of_dicts(self):
    """reduce_metrics merges per-worker metric dicts and averages the pooled values."""
    # Two workers each report list-valued metrics, as produced by DataProto.concat.
    worker_a = {"loss": [1.0, 2.0], "accuracy": [0.8, 0.9]}
    worker_b = {"loss": [3.0, 4.0], "accuracy": [0.7, 0.6]}

    reduced = reduce_metrics([worker_a, worker_b])

    # Values from both workers are pooled before the mean is taken.
    self.assertAlmostEqual(reduced["loss"], 2.5)  # mean of [1.0, 2.0, 3.0, 4.0]
    self.assertAlmostEqual(reduced["accuracy"], 0.75)  # mean of [0.8, 0.9, 0.7, 0.6]
81+
82+
def test_reduce_metrics_list_of_dicts_with_scalars(self):
    """reduce_metrics accepts scalar (non-list) values inside per-worker dicts."""
    # Each worker reports a bare number per metric rather than a list.
    per_worker = [
        {"loss": 1.0, "accuracy": 0.8},
        {"loss": 3.0, "accuracy": 0.6},
    ]

    reduced = reduce_metrics(per_worker)

    # Scalars are collected across workers and then averaged.
    self.assertEqual(reduced["loss"], 2.0)  # mean of [1.0, 3.0]
    self.assertEqual(reduced["accuracy"], 0.7)  # mean of [0.8, 0.6]
94+
95+
def test_reduce_metrics_list_with_max_min_keys(self):
    """Keys containing 'max'/'min' are reduced with max/min over the pooled values."""
    per_worker = [
        {"max_reward": [5.0, 8.0], "min_error": [0.1, 0.05]},
        {"max_reward": [6.0, 7.0], "min_error": [0.2, 0.15]},
    ]

    reduced = reduce_metrics(per_worker)

    # A "max" key name selects max-aggregation across all workers' values.
    self.assertEqual(reduced["max_reward"], 8.0)  # max of [5.0, 8.0, 6.0, 7.0]
    # A "min" key name selects min-aggregation across all workers' values.
    self.assertEqual(reduced["min_error"], 0.05)  # min of [0.1, 0.05, 0.2, 0.15]
107+
69108

70109
class TestComputeDataMetrics(unittest.TestCase):
71110
"""Tests for the compute_data_metrics function."""

verl/utils/metric/utils.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import numpy as np
2121

2222

23-
def reduce_metrics(metrics: dict[str, list[Any]]) -> dict[str, Any]:
23+
def reduce_metrics(metrics: dict[str, list[Any]] | list[dict[str, Any]]) -> dict[str, Any]:
2424
"""
2525
Reduces a dictionary of metric lists by computing the mean, max, or min of each list.
2626
The reduce operation is determined by the key name:
@@ -29,7 +29,9 @@ def reduce_metrics(metrics: dict[str, list[Any]]) -> dict[str, Any]:
2929
- Otherwise, np.mean is used
3030
3131
Args:
32-
metrics: A dictionary mapping metric names to lists of metric values.
32+
metrics: Either:
33+
- A dictionary mapping metric names to lists of metric values, or
34+
- A list of dictionaries from multiple workers (e.g., after DataProto.concat())
3335
3436
Returns:
3537
A dictionary with the same keys but with each list replaced by its reduced value.
@@ -43,7 +45,30 @@ def reduce_metrics(metrics: dict[str, list[Any]]) -> dict[str, Any]:
4345
... }
4446
>>> reduce_metrics(metrics)
4547
{"loss": 2.0, "accuracy": 0.8, "max_reward": 8.0, "min_error": 0.05}
48+
49+
>>> metrics_list = [
50+
... {"loss": [1.0, 2.0], "accuracy": [0.8, 0.9]},
51+
... {"loss": [3.0, 4.0], "accuracy": [0.7, 0.6]}
52+
... ]
53+
>>> reduce_metrics(metrics_list)
54+
{"loss": 2.5, "accuracy": 0.75}
4655
"""
56+
# Handle list of dicts (from multiple workers after DataProto.concat)
57+
if isinstance(metrics, list):
58+
# Merge all metric dicts into a single dict with lists
59+
merged_metrics = {}
60+
for worker_metrics in metrics:
61+
for key, val in worker_metrics.items():
62+
if key not in merged_metrics:
63+
merged_metrics[key] = []
64+
# val could be a single value or a list
65+
if isinstance(val, list):
66+
merged_metrics[key].extend(val)
67+
else:
68+
merged_metrics[key].append(val)
69+
metrics = merged_metrics
70+
71+
# Now reduce the dict of lists
4772
for key, val in metrics.items():
4873
if "max" in key:
4974
metrics[key] = np.max(val)

0 commit comments

Comments
 (0)