[misc] fix: Handle N-D arrays and complex objects in union_numpy_dict (volcengine#2768)

Tyizhanshen · Tyizhanshen · commit 887b16d3c39c · 2025-07-28T11:28:40.000+08:00
### What does this PR do? This PR fixes a bug in `verl.protocol.union_numpy_dict` where it would crash on NumPy arrays with more than 2 dimensions. It replaces the underlying comparison logic with a robust, recursive function that can handle N-D arrays, nested objects, `NaN` values, and circular references. This resolves issue volcengine#2766. ### Checklist Before Starting - [x] Search for similar PRs. Paste at least one query link here: ... - [x] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI) - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data` - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]` - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test` - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title. - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching` ### Test A comprehensive unit test suite has been added to `tests/test_protocol_on_cpu.py`. The new tests cover the following scenarios, all of which now pass: * Merging dictionaries with identical 3D (and higher) dimensional arrays. * Correctly failing when N-D arrays with the same shape but different values are merged. * Handling nested `object`-dtype arrays containing other arrays, strings, and `None`. * Correctly treating `NaN` values at the same position as equal, mimicking pandas' behavior. * Safely handling circular references without causing a `RecursionError`. ### Checklist Before Submitting > [!IMPORTANT] > Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review. - [x] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md). 
- [x] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always` - [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs). - [x] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ... - [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)
diff --git a/tests/test_protocol_on_cpu.py b/tests/test_protocol_on_cpu.py
@@ -33,21 +33,99 @@ def test_union_tensor_dict():
         {"obs": obs.clone(), "next_obs": torch.randn(100, 10), "rew": torch.randn(100)}, batch_size=[100]
     )
 
-    data = union_tensor_dict(data1, data2)
+    union_tensor_dict(data1, data2)
     with pytest.raises(AssertionError):
-        data = union_tensor_dict(data1, data_with_copied_obs)
-
+        union_tensor_dict(data1, data_with_copied_obs)
+
+
+def test_union_numpy_dict():
+    """
+    A comprehensive test suite for union_numpy_dict, covering standard use
+    cases, N-dimensional arrays, object-dtype arrays, NaN value handling, and
+    circular references.
+    """
+    # --- Smoke checks: identical 3D and object-dtype arrays merge cleanly ---
+    arr_3d = np.arange(8).reshape((2, 2, 2))
+    union_numpy_dict({"a": arr_3d}, {"a": arr_3d})
+    # Two separately built object arrays, each holding its own nested ndarray.
+    arr1 = np.array([1, "hello", np.array([2, 3])], dtype=object)
+    arr2 = np.array([1, "hello", np.array([2, 3])], dtype=object)
+    union_numpy_dict({"a": arr1}, {"a": arr2})
+    # --- Test Case 1: The original test with mixed object/float types ---
+    # This test case from the original test file is preserved.
     data = np.random.random(100)
-    data2 = [float("nan") for _ in range(99)]
-    data2.append("nan")
-    data2 = np.array(data2, dtype=object)
-    data3 = np.tile(data2, (2, 1))
-    a = {"a": data, "b": data2, "c": data3}
-    b = {"a": data, "b": data2, "c": data3}
-    b_ = {"a": np.random.random(100)}
-    union_numpy_dict(a, b)
+    # This array intentionally mixes float('nan') and the string 'nan'
+    nan_data = [float("nan") for _ in range(99)]
+    nan_data.append("nan")
+    nan_data_arr = np.array(nan_data, dtype=object)
+
+    dict1 = {"a": data, "b": nan_data_arr}
+    # Copies make the second dict hold distinct array objects with equal contents.
+    dict2_same = {"a": data.copy(), "b": nan_data_arr.copy()}
+    dict3_different = {"a": np.random.random(100)}
+
+    union_numpy_dict(dict1, dict2_same)  # Should pass
+    with pytest.raises(AssertionError):
+        union_numpy_dict(dict1, dict3_different)
+
+    # --- Test Case 2: Standard 3D arrays (fixes the core bug) ---
+    arr_3d = np.arange(24, dtype=np.int32).reshape((2, 3, 4))
+    dict_3d_1 = {"nd_array": arr_3d}
+    dict_3d_2_same = {"nd_array": arr_3d.copy()}
+    dict_3d_3_different = {"nd_array": arr_3d + 1}
+
+    union_numpy_dict(dict_3d_1, dict_3d_2_same)  # Should pass
+    # The mismatch must surface through the assertion message that names the key.
+    with pytest.raises(AssertionError, match="`nd_array` in tensor_dict1 and tensor_dict2 are not the same object."):
+        union_numpy_dict(dict_3d_1, dict_3d_3_different)
+
+    # --- Test Case 3: Nested 2D and 4D object-dtype arrays ---
+    sub_arr1 = np.array([1, 2])
+    sub_arr2 = np.array([3.0, 4.0])
+    # 2D object array
+    arr_2d_obj = np.array([[sub_arr1, "text"], [sub_arr2, None]], dtype=object)
+    arr_2d_obj_diff = np.array([[sub_arr1, "text"], [sub_arr2, "other"]], dtype=object)
+
+    union_numpy_dict({"data": arr_2d_obj}, {"data": arr_2d_obj.copy()})  # Should pass
+    with pytest.raises(AssertionError):
+        union_numpy_dict({"data": arr_2d_obj}, {"data": arr_2d_obj_diff})
+
+    # 4D object array to ensure deep recursion is robust
+    arr_4d_obj = np.array([[[[sub_arr1]]], [[[sub_arr2]]]], dtype=object)
+    arr_4d_obj_diff = np.array([[[[sub_arr1]]], [[[np.array([9, 9])]]]], dtype=object)
+
+    union_numpy_dict({"data": arr_4d_obj}, {"data": arr_4d_obj.copy()})  # Should pass
+    with pytest.raises(AssertionError):
+        union_numpy_dict({"data": arr_4d_obj}, {"data": arr_4d_obj_diff})
+
+    # --- Test Case 4: Explicit NaN value comparison ---
+    # This verifies that our new _deep_equal logic correctly handles NaNs.
+    nan_arr = np.array([1.0, np.nan, 3.0])
+    dict_nan_1 = {"data": nan_arr}
+    dict_nan_2_same = {"data": np.array([1.0, np.nan, 3.0])}  # A new array with same values
+    dict_nan_3_different_val = {"data": np.array([1.0, 2.0, 3.0])}
+    dict_nan_4_different_pos = {"data": np.array([np.nan, 1.0, 3.0])}
+
+    # NaNs in the same position should be considered equal for merging.
+    union_numpy_dict(dict_nan_1, dict_nan_2_same)  # Should pass
+
+    with pytest.raises(AssertionError):
+        union_numpy_dict(dict_nan_1, dict_nan_3_different_val)
+    with pytest.raises(AssertionError):
+        union_numpy_dict(dict_nan_1, dict_nan_4_different_pos)
+
+    # --- Test Case 5: Circular reference handling ---
+    # Create two separate, but structurally identical, circular references.
+    # This should pass without a RecursionError.
+    circ_arr_1 = np.array([None], dtype=object)
+    circ_arr_1[0] = circ_arr_1
+
+    circ_arr_2 = np.array([None], dtype=object)
+    circ_arr_2[0] = circ_arr_2
+
+    union_numpy_dict({"data": circ_arr_1}, {"data": circ_arr_2})  # Should pass
+
+    # Create a circular reference and a non-circular one.
+    # This should fail with an AssertionError because they are different.
+    non_circ_arr = np.array([None], dtype=object)
+
     with pytest.raises(AssertionError):
-        union_numpy_dict(a, b_)
+        union_numpy_dict({"data": circ_arr_1}, {"data": non_circ_arr})
 
 
 def test_tensor_dict_constructor():
diff --git a/verl/protocol.py b/verl/protocol.py
@@ -19,13 +19,13 @@
 import contextlib
 import copy
 import logging
+import math
 import os
 import pickle
 from dataclasses import dataclass, field
-from typing import Callable, Optional
+from typing import Any, Callable, Optional
 
 import numpy as np
-import pandas as pd
 import ray
 import tensordict
 import torch
@@ -118,14 +118,77 @@ def union_tensor_dict(tensor_dict1: TensorDict, tensor_dict2: TensorDict) -> Ten
     return tensor_dict1
 
 
+def _array_equal(array1: np.ndarray, array2: np.ndarray, visited: set[tuple[int, int]]) -> bool:
+    """
+    Recursively compares two NumPy arrays for strict equality, with special
+    handling for object-dtype arrays, NaN values, and circular references.
+    This function assumes that the two arguments provided are NumPy arrays.
+
+    Args:
+        array1: The first NumPy array.
+        array2: The second NumPy array.
+        visited: ``(id(a), id(b))`` pairs of arrays whose comparison is still
+            in progress further up the recursion. It is only forwarded to
+            ``_deep_equal``, which owns the cycle bookkeeping.
+
+    Returns:
+        True if the arrays' dtypes, shapes, and all elements are equal.
+    """
+    # Check dtype and shape first, as this is the fastest failure path.
+    if array1.dtype != array2.dtype or array1.shape != array2.shape:
+        return False
+
+    if array1.dtype != "object":
+        # `equal_nan=True` makes NumPy evaluate `np.isnan`, which raises
+        # TypeError for dtypes such as str, bytes, or structured/void. Only
+        # request it for kinds that can hold NaN/NaT: float, complex,
+        # timedelta64, and datetime64.
+        if array1.dtype.kind in "fcmM":
+            return np.array_equal(array1, array2, equal_nan=True)
+        return np.array_equal(array1, array2)
+
+    # For object-dtype arrays, we must recursively compare each element.
+    # We delegate to _deep_equal to handle elements, as they could be any
+    # type, including other nested arrays or NaNs. The shapes are already
+    # known to match, so both flat iterators have the same length.
+    return all(_deep_equal(x, y, visited) for x, y in zip(array1.flat, array2.flat, strict=False))
+
+
+def _deep_equal(a: Any, b: Any, visited: set[tuple[int, int]]) -> bool:
+    """
+    Recursively performs a deep comparison between two Python objects.
+    - Requires both objects to have exactly the same type.
+    - Treats two NaN floats (Python or NumPy floating scalars) as equal.
+    - Handles circular references between NumPy arrays.
+    - Dispatches to _array_equal if both objects are NumPy arrays.
+    - Otherwise, uses standard '==' comparison.
+
+    Args:
+        a: The first object.
+        b: The second object.
+        visited: ``(id(a), id(b))`` pairs of arrays currently being compared
+            on this recursion path. Callers start with an empty set; it is
+            restored to its previous contents before this function returns.
+
+    Returns:
+        True if the two objects are considered equal.
+    """
+    if type(a) is not type(b):
+        return False
+
+    # np.float64 subclasses float, but other NumPy floating scalars (e.g.
+    # np.float32) do not, so both are listed to treat their NaNs alike.
+    if isinstance(a, (float, np.floating)):
+        return bool(a == b) or (math.isnan(a) and math.isnan(b))
+
+    if not isinstance(a, np.ndarray):
+        # Standard equality for all other types. Only arrays are recursed
+        # into, so only arrays can take part in a cycle.
+        return bool(a == b)
+
+    # Track the *pair* under comparison rather than `a` alone. Meeting the same
+    # pair again means both structures looped back in lockstep, so this branch
+    # adds no new evidence of inequality. Keying on id(a) only would wrongly
+    # accept any array `b` once `a` had been seen on the path.
+    pair = (id(a), id(b))
+    if pair in visited:
+        return True
+
+    visited.add(pair)
+    try:
+        # We know b is also an ndarray due to the initial type check
+        return _array_equal(a, b, visited)
+    finally:
+        # Clean up on the way out of the recursion, even if a comparison raised
+        visited.discard(pair)
+
+
 def union_numpy_dict(tensor_dict1: dict[str, np.ndarray], tensor_dict2: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
     for key, val in tensor_dict2.items():
         if key in tensor_dict1:
             assert isinstance(tensor_dict2[key], np.ndarray)
             assert isinstance(tensor_dict1[key], np.ndarray)
             # to properly deal with nan and object type
-            assert pd.DataFrame(tensor_dict2[key]).equals(pd.DataFrame(tensor_dict1[key])), (
-                f"{key} in tensor_dict1 and tensor_dict2 are not the same object"
+            assert _deep_equal(tensor_dict1[key], tensor_dict2[key], visited=set()), (
+                f"`{key}` in tensor_dict1 and tensor_dict2 are not the same object."
             )
         tensor_dict1[key] = val