diff --git a/.github/workflows/e2e_eval_aime24.yml b/.github/workflows/e2e_eval_aime24.yml
index b0da8f2acc2..e674fd10cbd 100644
--- a/.github/workflows/e2e_eval_aime24.yml
+++ b/.github/workflows/e2e_eval_aime24.yml
@@ -124,7 +124,7 @@ jobs:
       - name: Install the current repository
         run: |
           pip3 install --no-deps -e .[test,gpu,math]
-          pip3 install math-verify
+          pip3 install math-verify transformers==4.56.2
       - name: Prepare aime24 dataset
         run: |
           ray stop --force
diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml
index d1d42f37ccb..b2966e795c3 100644
--- a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml
+++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml
@@ -87,6 +87,7 @@ permissions:
 env:
   IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2"
   DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
+  TRANSFORMERS_VERSION: "4.56.2"
 
 jobs:
   setup:
@@ -121,6 +122,7 @@ jobs:
       - name: Install the current repository
         run: |
           pip3 install --no-deps -e .[test]
+          pip3 install math-verify transformers==$TRANSFORMERS_VERSION
       - name: Prepare GSM8K dataset
         run: |
           python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
@@ -173,6 +175,7 @@ jobs:
       - name: Install the current repository
         run: |
           pip3 install --no-deps -e .[test]
+          pip3 install math-verify transformers==$TRANSFORMERS_VERSION
       - name: Prepare GSM8K dataset
         run: |
           python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
@@ -210,6 +213,7 @@ jobs:
       - name: Install the current repository
         run: |
           pip3 install --no-deps -e .[test]
+          pip3 install math-verify transformers==$TRANSFORMERS_VERSION
       - name: Prepare GSM8K dataset
         run: |
           python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
@@ -241,6 +245,7 @@ jobs:
       - name: Install the current repository
         run: |
           pip3 install --no-deps -e .[test]
+          pip3 install math-verify transformers==$TRANSFORMERS_VERSION
       - name: Prepare GSM8K dataset
         run: |
           python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml
index 07cb1bd9752..3183210de1f 100644
--- a/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml
+++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml
@@ -87,6 +87,7 @@ permissions:
 env:
   IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2"
   DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
+  TRANSFORMERS_VERSION: "4.56.2"
 
 jobs:
   setup:
@@ -121,6 +122,7 @@ jobs:
       - name: Install the current repository
         run: |
           pip3 install --no-deps -e .[test]
+          pip3 install transformers==$TRANSFORMERS_VERSION
       - name: Prepare GSM8K dataset
         run: |
           python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
@@ -154,6 +156,7 @@ jobs:
         run: |
           pip3 install --no-deps -e .[test]
           pip3 install mbridge
+          pip3 install transformers==$TRANSFORMERS_VERSION
       - name: Prepare GSM8K dataset
         run: |
           python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
@@ -186,6 +189,7 @@ jobs:
       - name: Install the current repository
         run: |
           pip3 install --no-deps -e .[test]
+          pip3 install transformers==$TRANSFORMERS_VERSION
       - name: Prepare Geo3k dataset
         run: |
           python3 examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/models/hf_data/hiyouga/geometry3k/
@@ -220,6 +224,7 @@ jobs:
       - name: Install the current repository
         run: |
           pip3 install --no-deps -e .[test,vllm]
+          pip3 install transformers==$TRANSFORMERS_VERSION
       - name: Prepare GSM8K dataset
         run: |
           ray stop --force
@@ -359,7 +364,7 @@ jobs:
       - name: Install the current repository
         run: |
           pip3 install --no-deps -e .[test,gpu,vllm,geo,trl]
-          pip install "transformers[hf_xet]==4.54.0"
+          pip3 install transformers==$TRANSFORMERS_VERSION
       # Geo3k
       - name: Prepare GEO3K dataset
         run: |
diff --git a/.github/workflows/reward_model.yml b/.github/workflows/reward_model.yml
index c69698f8c9d..7ba377194e0 100644
--- a/.github/workflows/reward_model.yml
+++ b/.github/workflows/reward_model.yml
@@ -58,9 +58,32 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 
+
+env:
+  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2"
+  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"
+  TRANSFORMERS_VERSION: "4.56.2"
+
+
 jobs:
+  setup:
+    if: github.repository_owner == 'volcengine'
+    runs-on: ubuntu-latest
+    outputs:
+      runner-label: ${{ steps.create-runner.outputs.runner-label }}
+      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
+    steps:
+      - uses: actions/checkout@v4
+      - id: create-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "create"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-image: "${{ env.IMAGE }}"
+
   reward_model:
-    runs-on: [L20x8]
+    needs: setup
+    runs-on: [ "${{ needs.setup.outputs.runner-label || 'L20x8' }}" ]
     timeout-minutes: 20 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
@@ -71,9 +94,6 @@ jobs:
       SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK: "True"
       NCCL_SHM_DISABLE: "1"
       NCCL_P2P_DISABLE: "1"
-    container:
-      image: verlai/verl:app-verl0.5-transformers4.55.4-sglang0.4.10.post2-mcore0.13.0-te2.2
-      options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
@@ -93,3 +113,19 @@ jobs:
         run: |
           unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
           pytest -s -x tests/workers/reward_model/test_generative_reward_model.py
+
+  cleanup:
+    runs-on: ubuntu-latest
+    needs:
+      [
+        setup,
+        reward_model
+      ]
+    if: always()
+    steps:
+      - id: destroy-runner
+        uses: volcengine/vemlp-github-runner@v1
+        with:
+          mode: "destroy"
+          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
+          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
\ No newline at end of file
diff --git a/recipe/dapo/dapo_ray_trainer.py b/recipe/dapo/dapo_ray_trainer.py
index 8c79fa325b0..a4d5c098353 100644
--- a/recipe/dapo/dapo_ray_trainer.py
+++ b/recipe/dapo/dapo_ray_trainer.py
@@ -332,6 +332,11 @@ def fit(self):
                     actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
                     metrics.update(actor_output_metrics)
 
+                # Log rollout generations if enabled
+                rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
+                if rollout_data_dir:
+                    self._log_rollout_data(batch, reward_extra_infos_dict, timing_raw, rollout_data_dir)
+
                 # validate
                 if (
                     self.val_reward_fn is not None
diff --git a/recipe/one_step_off_policy/ray_trainer.py b/recipe/one_step_off_policy/ray_trainer.py
index 662f1852646..e192e655ef7 100644
--- a/recipe/one_step_off_policy/ray_trainer.py
+++ b/recipe/one_step_off_policy/ray_trainer.py
@@ -552,22 +552,7 @@ def fit(self):
                 # Log rollout generations if enabled
                 rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
                 if rollout_data_dir:
-                    with marked_timer("dump_rollout_generations", timing_raw, color="green"):
-                        inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
-                        outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
-                        scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
-                        sample_gts = [
-                            item.non_tensor_batch.get("reward_model", {}).get("ground_truth", None) for item in batch
-                        ]
-
-                        self._dump_generations(
-                            inputs=inputs,
-                            outputs=outputs,
-                            gts=sample_gts,
-                            scores=scores,
-                            reward_extra_infos_dict=reward_extra_infos_dict,
-                            dump_path=rollout_data_dir,
-                        )
+                    self._log_rollout_data(batch, reward_extra_infos_dict, timing_raw, rollout_data_dir)
 
                 # validate
                 if (
diff --git a/recipe/sppo/sppo_ray_trainer.py b/recipe/sppo/sppo_ray_trainer.py
index baf1786964a..e075beec8c3 100644
--- a/recipe/sppo/sppo_ray_trainer.py
+++ b/recipe/sppo/sppo_ray_trainer.py
@@ -308,22 +308,7 @@ def fit(self):
                 # Log rollout generations if enabled
                 rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
                 if rollout_data_dir:
-                    with simple_timer("dump_rollout_generations", timing_raw):
-                        print(batch.batch.keys())
-                        inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
-                        outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
-                        scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
-                        sample_gts = [
-                            item.non_tensor_batch.get("reward_model", {}).get("ground_truth", None) for item in batch
-                        ]
-                        self._dump_generations(
-                            inputs=inputs,
-                            outputs=outputs,
-                            scores=scores,
-                            reward_extra_infos_dict=reward_extra_infos_dict,
-                            gts=sample_gts,
-                            dump_path=rollout_data_dir,
-                        )
+                    self._log_rollout_data(batch, reward_extra_infos_dict, timing_raw, rollout_data_dir)
 
                 # validate
                 if (
diff --git a/verl/trainer/config/actor/actor.yaml b/verl/trainer/config/actor/actor.yaml
index ff03239f5d3..3b5764e4a39 100644
--- a/verl/trainer/config/actor/actor.yaml
+++ b/verl/trainer/config/actor/actor.yaml
@@ -120,7 +120,7 @@ optim:
   # Learning rate
   lr: 1e-6
 
-  # Warmup steps ratio (used if lr_warmup_steps is negative)
+  # Warmup steps ratio (used if lr_warmup_steps is 0 or negative)
   lr_warmup_steps_ratio: 0.0
 
   # Total training steps (must be overridden at runtime)
diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py
index bb945c0451f..0e3b1b77c5f 100644
--- a/verl/trainer/ppo/ray_trainer.py
+++ b/verl/trainer/ppo/ray_trainer.py
@@ -441,6 +441,38 @@ def _dump_generations(self, inputs, outputs, gts, scores, reward_extra_infos_dic
 
         print(f"Dumped generations to {filename}")
 
+    def _log_rollout_data(
+        self, batch: DataProto, reward_extra_infos_dict: dict, timing_raw: dict, rollout_data_dir: str
+    ):
+        """Log rollout data to disk.
+        Args:
+            batch (DataProto): The batch containing rollout data
+            reward_extra_infos_dict (dict): Additional reward information to log
+            timing_raw (dict): Timing information for profiling
+            rollout_data_dir (str): Directory path to save the rollout data
+        """
+        with marked_timer("dump_rollout_generations", timing_raw, color="green"):
+            inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
+            outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
+            scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
+            sample_gts = [item.non_tensor_batch.get("reward_model", {}).get("ground_truth", None) for item in batch]
+
+            # Add request ids to a copy so the caller's dict is not mutated
+            reward_extra_infos_to_dump = reward_extra_infos_dict.copy()
+            if "request_id" in batch.non_tensor_batch:
+                reward_extra_infos_to_dump.setdefault(
+                    "request_id",
+                    batch.non_tensor_batch["request_id"].tolist(),
+                )
+
+            self._dump_generations(
+                inputs=inputs,
+                outputs=outputs,
+                gts=sample_gts,
+                scores=scores,
+                reward_extra_infos_dict=reward_extra_infos_to_dump,
+                dump_path=rollout_data_dir,
+            )
+
     def _maybe_log_val_generations(self, inputs, outputs, scores):
         """Log a table of validation samples to the configured logger (wandb or swanlab)"""
 
@@ -1111,29 +1143,7 @@ def fit(self):
                 # Log rollout generations if enabled
                 rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
                 if rollout_data_dir:
-                    with marked_timer("dump_rollout_generations", timing_raw, color="green"):
-                        inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
-                        outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
-                        scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
-                        sample_gts = [
-                            item.non_tensor_batch.get("reward_model", {}).get("ground_truth", None)
-                            for item in batch
-                        ]
-
-                        if "request_id" in batch.non_tensor_batch:
-                            reward_extra_infos_dict.setdefault(
-                                "request_id",
-                                batch.non_tensor_batch["request_id"].tolist(),
-                            )
-
-                        self._dump_generations(
-                            inputs=inputs,
-                            outputs=outputs,
-                            gts=sample_gts,
-                            scores=scores,
-                            reward_extra_infos_dict=reward_extra_infos_dict,
-                            dump_path=rollout_data_dir,
-                        )
+                    self._log_rollout_data(batch, reward_extra_infos_dict, timing_raw, rollout_data_dir)
 
                 # validate
                 if (
diff --git a/verl/utils/megatron/optimizer.py b/verl/utils/megatron/optimizer.py
index 18799d5cf8b..4647e11257c 100644
--- a/verl/utils/megatron/optimizer.py
+++ b/verl/utils/megatron/optimizer.py
@@ -23,11 +23,11 @@
 
 def init_megatron_optim_config(optim_config: dict) -> OptimizerConfig:
     optim_args = {
-        "optimizer": optim_config.get("optimizer", "adam"),
-        "lr": optim_config.get("lr"),
-        "min_lr": optim_config.get("min_lr", None),
-        "clip_grad": optim_config.get("clip_grad", 1.0),
-        "weight_decay": optim_config.get("weight_decay", 0.01),
+        "optimizer": optim_config.optimizer,
+        "lr": optim_config.lr,
+        "min_lr": optim_config.min_lr,
+        "clip_grad": optim_config.clip_grad,
+        "weight_decay": optim_config.weight_decay,
         "bf16": True,
         "params_dtype": torch.bfloat16,
         "use_distributed_optimizer": True,
diff --git a/verl/workers/engine/fsdp/transformer_impl.py b/verl/workers/engine/fsdp/transformer_impl.py
index 23040bf2475..2b5bcb393b1 100644
--- a/verl/workers/engine/fsdp/transformer_impl.py
+++ b/verl/workers/engine/fsdp/transformer_impl.py
@@ -375,13 +375,13 @@ def _build_lr_scheduler(self, optimizer):
 
         optim_config = self.optimizer_config
 
-        total_steps = optim_config.get("total_training_steps", 0)
-        num_warmup_steps = int(optim_config.get("lr_warmup_steps", -1))
-        warmup_style = optim_config.get("warmup_style", "constant")
-        min_lr_ratio = optim_config.get("min_lr_ratio", 0.0)
-        num_cycles = optim_config.get("num_cycles", 0.5)
-        if num_warmup_steps < 0:
-            num_warmup_steps_ratio = optim_config.get("lr_warmup_steps_ratio", 0.0)
+        total_steps = optim_config.total_training_steps
+        num_warmup_steps = optim_config.lr_warmup_steps
+        warmup_style = optim_config.warmup_style
+        min_lr_ratio = optim_config.min_lr_ratio
+        num_cycles = optim_config.num_cycles
+        if num_warmup_steps <= 0:
+            num_warmup_steps_ratio = optim_config.lr_warmup_steps_ratio
             num_warmup_steps = int(num_warmup_steps_ratio * total_steps)
 
         if self.rank == 0: