volcengine
diff --git a/‎docs/api/single_controller.rst‎
Lines changed: 4 additions & 2 deletions b/‎docs/api/single_controller.rst‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎docs/api/trainer.rst‎
Lines changed: 11 additions & 4 deletions b/‎docs/api/trainer.rst‎
Lines changed: 11 additions & 4 deletions
diff --git a/‎docs/api/utils.rst‎
Lines changed: 70 additions & 4 deletions b/‎docs/api/utils.rst‎
Lines changed: 70 additions & 4 deletions
diff --git a/‎docs/conf.py‎
Lines changed: 4 additions & 0 deletions b/‎docs/conf.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎docs/index.rst‎
Lines changed: 2 additions & 1 deletion b/‎docs/index.rst‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎tests/ray_cpu/test_ray_utils.py‎
Lines changed: 54 additions & 0 deletions b/‎tests/ray_cpu/test_ray_utils.py‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎tests/sandbox/test_sandbox.py‎
Lines changed: 6 additions & 6 deletions b/‎tests/sandbox/test_sandbox.py‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎tests/utils/gpu_tests/megatron/test_pipeline_parallel.py‎
Lines changed: 47 additions & 0 deletions b/‎tests/utils/gpu_tests/megatron/test_pipeline_parallel.py‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎verl/trainer/ppo/core_algos.py‎
Lines changed: 12 additions & 15 deletions b/‎verl/trainer/ppo/core_algos.py‎
Lines changed: 12 additions & 15 deletions
@@ -22,5 +22,7 @@ Core APIs
 .. autoclass:: verl.single_controller.ResourcePool
    :members: __init__, world_size, local_world_size_list, local_rank_list
 
-.. automodule:: verl.single_controller.ray
-   :members: RayWorkerGroup, create_colocated_worker_cls
+.. autoclass:: verl.single_controller.ray.RayWorkerGroup
+   :members: __init__
+
+.. autofunction:: verl.single_controller.ray.create_colocated_worker_cls
@@ -1,5 +1,5 @@
-Trainers
-=========================
+Trainer Interface
+================================
 
 Trainers drive the training loop. Introducing new trainer classes in case of new training paradiam is encouraged.
 
@@ -13,9 +13,16 @@ Core APIs
 ~~~~~~~~~~~~~~~~~
 
 .. autoclass::  verl.trainer.ppo.ray_trainer.RayPPOTrainer
+   :members: __init__, init_workers, fit
+
 
 .. automodule:: verl.utils.tokenizer
    :members: hf_tokenizer
 
-.. automodule:: verl.single_controller
-   :members: Worker, WorkerGroup, ClassWithInitArgs, ResourcePool
+
+.. automodule:: verl.trainer.ppo.core_algos
+   :members: agg_loss, kl_penalty, compute_policy_loss, kl_penalty
+
+
+.. automodule:: verl.trainer.ppo.reward
+   :members: load_reward_manager, compute_reward, compute_reward_async
@@ -1,8 +1,74 @@
-Training utils
-=========================
+Utilities
+============
 
-Core APIs
-~~~~~~~~~~~~~~~~~
+This section documents the utility functions and classes in the VERL library.
+
+Python Functional Utilities
+------------------------------
+
+.. automodule:: verl.utils.py_functional
+   :members: append_to_dict
+
+File System Utilities
+------------------------
+
+.. automodule:: verl.utils.fs
+   :members: copy_to_local
+
+Tracking Utilities
+---------------------
+
+.. automodule:: verl.utils.tracking
+   :members: Tracking
+
+Metrics Utilities
+---------------------
 
 .. automodule::  verl.utils.metric
    :members: reduce_metrics
+
+Checkpoint Management
+------------------------
+
+.. automodule:: verl.utils.checkpoint.checkpoint_manager
+   :members: find_latest_ckpt_path
+
+.. automodule:: verl.utils.checkpoint.fsdp_checkpoint_manager
+   :members: FSDPCheckpointManager
+
+Dataset Utilities
+---------------------
+
+.. automodule:: verl.utils.dataset.rl_dataset
+   :members: RLHFDataset, collate_fn
+
+Torch Functional Utilities
+-----------------------------
+
+.. automodule:: verl.utils.torch_functional
+   :members: get_constant_schedule_with_warmup, masked_whiten, masked_mean, logprobs_from_logits
+
+Sequence Length Balancing
+----------------------------
+
+.. automodule:: verl.utils.seqlen_balancing
+   :members: get_reverse_idx, rearrange_micro_batches
+
+Ulysses Utilities
+--------------------
+
+.. automodule:: verl.utils.ulysses
+   :members: gather_outpus_and_unpad, ulysses_pad_and_slice_inputs
+
+FSDP Utilities
+------------------
+
+.. automodule:: verl.utils.fsdp_utils
+   :members: get_fsdp_wrap_policy, get_init_weight_context_manager, init_fn, load_fsdp_model_to_gpu, load_fsdp_optimizer, offload_fsdp_model_to_cpu, offload_fsdp_optimizer,
+
+Debug Utilities
+-------------------
+
+.. automodule:: verl.utils.debug
+   :members: log_gpu_memory_usage, GPUMemoryLogger
+
@@ -48,7 +48,11 @@
     "sphinx.ext.autodoc",
     "sphinx.ext.autosummary",
     "sphinx.ext.autosectionlabel",
+    "sphinx.ext.napoleon",
 ]
+# Use Google style docstrings instead of NumPy docstrings.
+napoleon_google_docstring = True
+napoleon_numpy_docstring = False
 
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
 
@@ -108,8 +108,9 @@ verl is fast with:
    :caption: API References
 
    api/data
-   api/utils
    api/single_controller.rst
+   api/trainer.rst
+   api/utils.rst
 
 
 .. toctree::
 
@@ -0,0 +1,54 @@
+# Copyright 2025 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import ray
+
+from verl.utils.ray_utils import parallel_put
+
+
+# Initialize Ray for testing if not already done globally
+@pytest.fixture()
+def init_ray():
+    ray.init(num_cpus=4)
+    yield
+    ray.shutdown()
+
+
+def test_parallel_put_basic(init_ray):
+    data = [1, "hello", {"a": 2}, [3, 4]]
+    refs = parallel_put(data)
+    assert len(refs) == len(data)
+    retrieved_data = [ray.get(ref) for ref in refs]
+    assert retrieved_data == data
+
+
+def test_parallel_put_empty(init_ray):
+    data = []
+    refs = parallel_put(data)
+    assert len(refs) == 0
+
+
+def test_parallel_put_workers(init_ray):
+    data = list(range(20))
+    # Test with specific number of workers
+    refs = parallel_put(data, max_workers=4)
+    assert len(refs) == len(data)
+    retrieved_data = [ray.get(ref) for ref in refs]
+    assert retrieved_data == data
+    # Test with default workers (should cap)
+    refs_default = parallel_put(data)
+    assert len(refs_default) == len(data)
+    retrieved_data_default = [ray.get(ref) for ref in refs_default]
+    assert retrieved_data_default == data
@@ -18,7 +18,7 @@
 
 import pytest
 
-from verl.utils.reward_score import _default_compute_score, prime_code, sandbox_fusion
+from verl.utils.reward_score import default_compute_score, prime_code, sandbox_fusion
 from verl.utils.reward_score.prime_code import apps_check_correctness
 from verl.workers.reward_manager.prime import parallel_compute_score_async
 
@@ -109,7 +109,7 @@ def test_parallelism():
         ground_truth.extend(prime_math_gts)
         data_sources.extend(["numina_aops_forum"] * len(prime_math_answers))
 
-    scores = asyncio.run(parallel_compute_score_async(_default_compute_score, sequences_str, ground_truth, data_sources, num_processes=16))
+    scores = asyncio.run(parallel_compute_score_async(default_compute_score, sequences_str, ground_truth, data_sources, num_processes=16))
     print(scores)
 
 
@@ -119,7 +119,7 @@ def test_prime_code():
     """
     data_source = "codecontests"
     for completion, ground_truth, score_ in zip(prime_code_answers, prime_code_gts, prime_code_scores):
-        score = _default_compute_score(data_source, completion, ground_truth)
+        score = default_compute_score(data_source, completion, ground_truth)
         assert float(score) == score_
 
 
@@ -135,7 +135,7 @@ def test_prime_code_sandbox_fusion():
     # Removed the previous 'if not sandbox_url' check block
 
     for completion, ground_truth, score_ in zip(prime_code_answers, prime_code_gts, prime_code_scores):
-        score = _default_compute_score(data_source, completion, ground_truth, extra_info={"sandbox_fusion_url": sandbox_fusion_url})  # <-- Use the URL obtained from the environment variable
+        score = default_compute_score(data_source, completion, ground_truth, extra_info={"sandbox_fusion_url": sandbox_fusion_url})  # <-- Use the URL obtained from the environment variable
         assert float(score) == score_
 
 
@@ -153,7 +153,7 @@ def test_continuous_score_consistency():
     prime_score, _ = sandbox_fusion.compute_score(os.environ.get("SANDBOX_FUSION_URL"), None, completion, ground_truth, continuous=True)
 
     # 2. Calculate score using sandbox_fusion with continuous=True
-    # Ensure the extra_info key triggers the sandbox_fusion path in _default_compute_score
+    # Ensure the extra_info key triggers the sandbox_fusion path in default_compute_score
     fusion_score, _ = prime_code.compute_score(completion, ground_truth, continuous=True)
 
     # 3. Assert scores are equal (using pytest.approx for float comparison)
@@ -175,5 +175,5 @@ def test_check_correctness():
 def test_prime_math():
     data_source = "numina_aops_forum"
     for completion, ground_truth in zip(prime_math_answers, prime_math_gts):
-        score = _default_compute_score(data_source, completion, ground_truth)
+        score = default_compute_score(data_source, completion, ground_truth)
         assert float(score) == 1.0
@@ -0,0 +1,47 @@
+# Copyright 2025 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from verl.utils.megatron.pipeline_parallel import make_batch_generator
+
+
+def test_make_batch_generator_no_vpp():
+    batches = [1, 2, 3]
+    vpp_size = 1
+    generator = make_batch_generator(batches, vpp_size)
+    assert list(generator) == batches
+
+
+def test_make_batch_generator_with_vpp():
+    batches = [{"data": 1}, {"data": 2}]
+    vpp_size = 2
+    generators = make_batch_generator(batches, vpp_size)
+    assert isinstance(generators, list)
+    assert len(generators) == vpp_size
+
+    # Check each generator yields the original batches
+    for gen in generators:
+        assert list(gen) == batches
+
+
+def test_make_batch_generator_empty():
+    batches = []
+    vpp_size = 1
+    generator = make_batch_generator(batches, vpp_size)
+    assert list(generator) == []
+
+    vpp_size = 3
+    generators = make_batch_generator(batches, vpp_size)
+    assert len(generators) == vpp_size
+    for gen in generators:
+        assert list(gen) == []
@@ -75,12 +75,12 @@ def compute_gae_advantage_return(
 
     Args:
         token_level_rewards: `(torch.Tensor)`
-            shape: (bs, response_length)
+            shape is (bs, response_length)
         values: `(torch.Tensor)`
-            shape: (bs, response_length)
+            shape is (bs, response_length)
         response_mask: `(torch.Tensor)`
-            shape: (bs, response_length). [EOS] mask. The token after [EOS] have mask zero.
-        gamma: `(float)`
+            shape is (bs, response_length). [EOS] mask. The token after [EOS] have mask zero.
+        gamma is `(float)`
             discounted factor used in RL
         lam: `(float)`
             lambda value when computing Generalized Advantage Estimation (https://arxiv.org/abs/1506.02438)
@@ -122,19 +122,19 @@ def compute_grpo_outcome_advantage(
     (with only one scalar reward for each response).
     Args:
         token_level_rewards: `(torch.Tensor)`
-            shape: (bs, response_length)
+            shape is (bs, response_length)
         response_mask: `(torch.Tensor)`
-            shape: (bs, response_length)
+            shape is (bs, response_length)
         norm_adv_by_std_in_grpo: (bool)
             whether to scale the GRPO advantage.
             If True, the advantage is scaled by the std, as in the original GRPO.
             If False, the advantage is not scaled, as in Dr.GRPO (https://arxiv.org/abs/2503.20783).
 
     Returns:
         advantages: `(torch.Tensor)`
-            shape: (bs, response_length)
+            shape is (bs, response_length)
         Returns: `(torch.Tensor)`
-            shape: (bs, response_length)
+            shape is (bs, response_length)
     """
     scores = token_level_rewards.sum(dim=-1)
 
@@ -371,15 +371,12 @@ def agg_loss(loss_mat: torch.Tensor, loss_mask: torch.Tensor, loss_agg_mode: str
     """
     Aggregate the loss matrix into a scalar.
     Args:
-        loss_mat: `(torch.Tensor)`
+        loss_mat: `(torch.Tensor)`:
             shape: (bs, response_length)
-        loss_mask: `(torch.Tensor)`
+        loss_mask: `(torch.Tensor)`:
             shape: (bs, response_length)
-        loss_agg_mode: (str) choices: "token-mean" /
-                                      "seq-mean-token-sum" /
-                                      "seq-mean-token-mean" /
-                                      "seq-mean-token-sum-norm" /
-            "token-mean" is the default behavior
+        loss_agg_mode: (str) choices:
+            method to aggregate the loss matrix into a scalar.
     Returns:
         loss: `a scalar torch.Tensor`
             aggregated loss