
Commit 2e8cf9a

[perf] support padding-free training for VLMs (#61)
1 parent 2a0f95d commit 2e8cf9a


60 files changed (+1743, -1348 lines)

Dockerfile

Lines changed: 6 additions & 2 deletions
@@ -37,11 +37,15 @@ RUN pip config set global.index-url "${PIP_INDEX}" && \
     python -m pip install --upgrade pip

 # Install torch-2.5.1 + vllm-0.7.3
-RUN pip install --no-cache-dir vllm==0.7.3 torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 tensordict \
+RUN pip install --no-cache-dir vllm==0.7.3 torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 tensordict torchdata \
     transformers>=4.49.0 accelerate datasets peft \
-    ray codetiming hydra-core pandas pyarrow>=15.0.0 pylatexenc qwen-vl-utils
+    ray codetiming hydra-core pandas pyarrow>=15.0.0 pylatexenc qwen-vl-utils wandb liger-kernel \

 # Install flash_attn-2.7.4.post1
 RUN pip uninstall -y transformer-engine flash-attn && \
     wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
     pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+
+# Fix cv2
+RUN pip uninstall -y pynvml nvidia-ml-py && \
+    pip install nvidia-ml-py>=12.560.30 opencv-python-headless==4.11.0.86 fastapi==0.115.6
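A quick way to check the dependencies this commit adds (torchdata, wandb, liger-kernel, and the pynvml/opencv fix) is to import them inside the built image. A minimal sketch, not part of the commit; the import names below are the conventional ones for these packages, not taken from the diff:

```bash
# Sanity-check the packages added in this commit inside the built image.
python3 - <<'EOF'
import cv2           # from opencv-python-headless==4.11.0.86
import pynvml        # module provided by nvidia-ml-py after the uninstall/reinstall
import torchdata     # added to the torch/vllm install line
import wandb         # added for experiment logging
import liger_kernel  # conventional import name for the liger-kernel package
print("cv2:", cv2.__version__)
EOF
```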

README.md

Lines changed: 9 additions & 7 deletions
@@ -29,6 +29,12 @@ EasyR1 is efficient and scalable due to the design of **[HybirdEngine](https://a

 We provide a [Dockerfile](./Dockerfile) to easily build environments.

+Use [pre-built docker image](https://hub.docker.com/r/hiyouga/verl):
+
+```bash
+docker pull hiyouga/verl:ngc-th2.5.1-cu120-vllm0.7.3-rc1
+```
+
 ### Hardware Requirements

 \* *estimated*
@@ -71,30 +77,26 @@ python3 scripts/model_merger.py --local_dir path_to_your_last_actor_checkpoint

 ## Custom Dataset

-The dataset should strictly follow the example data format.
+Please refer to the example datasets to prepare your own dataset.

 - Text dataset: https://huggingface.co/datasets/hiyouga/math12k
-  - Required columns: problem, answer
-
 - Vision-text dataset: https://huggingface.co/datasets/hiyouga/geometry3k
-  - Required columns: images, problem, answer

 ## Other Baselines

-- [CLEVR-70k-Counting](examples/run_qwen2_5_vl_2b_clevr.sh): Train the Qwen2.5-VL-3B-Instruct model on counting problem.
+- [CLEVR-70k-Counting](examples/run_qwen2_5_vl_3b_clevr.sh): Train the Qwen2.5-VL-3B-Instruct model on counting problem.

 ## TODO

 - Support PPO, Reinforce++ and RLOO for VLMs.
-- Support padding-free training for VLMs.
 - Support ulysses parallelism for VLMs.
 - Support more VLM architectures.

 ### Known bugs

 These features are temporarily disabled for now, we plan to fix them one-by-one in the future updates.

-- Vision language models are not compatible with padding-free training and ulysses parallelism yet.
+- Vision language models are not compatible with ulysses parallelism yet.
 - Vision language models are not compatible with `enable_chunked_prefill` unless [vLLM v1](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html) is supported.

 ## Discussion Group
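Padding-free training is the headline of this commit, and the example configs enable it through the new `worker.actor.padding_free` key (see the YAML diff below). A hedged sketch of toggling it from the command line, reusing the dotted-override style the example scripts already use:

```bash
# Sketch: disable the new padding-free path via a CLI override
# (key name taken from examples/grpo_example.yaml in this commit).
python3 -m verl.trainer.main \
    config=examples/grpo_example.yaml \
    worker.actor.padding_free=false
```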

examples/grpo_example.yaml

Lines changed: 15 additions & 10 deletions
@@ -2,6 +2,8 @@ data:
   train_files: hiyouga/math12k@train
   val_files: hiyouga/math12k@test
   prompt_key: problem
+  answer_key: answer
+  image_key: images
   max_prompt_length: 1024
   max_response_length: 1024
   rollout_batch_size: 512
@@ -17,36 +19,38 @@ algorithm:
 worker:
   actor:
     global_batch_size: 128
-    micro_batch_size_per_device_for_update: 1
-    micro_batch_size_per_device_for_experience: 2
+    micro_batch_size_per_device_for_update: 4
+    micro_batch_size_per_device_for_experience: 16
     max_grad_norm: 1.0
     use_kl_loss: true
     kl_loss_coef: 1.0e-3
     kl_loss_type: low_var_kl
+    padding_free: true
+    ulysses_sequence_parallel_size: 1
     model:
       model_path: Qwen/Qwen2.5-7B-Instruct
       enable_gradient_checkpointing: true
     optim:
       lr: 1.0e-6
       weight_decay: 1.0e-2
     fsdp:
-      param_offload: false
-      optimizer_offload: false
-      torch_dtype: null
+      enable_full_shard: true
+      enable_cpu_offload: false
+      enable_rank0_init: false
     offload:
-      param_offload: true
-      optimizer_offload: true
+      offload_params: false
+      offload_optimizer: false

   rollout:
     temperature: 1.0
     tensor_parallel_size: 2
-    gpu_memory_utilization: 0.6
+    gpu_memory_utilization: 0.5
     n: 5
     enable_chunked_prefill: true

   ref:
     offload:
-      param_offload: true
+      offload_params: true

   reward:
     reward_type: function
@@ -60,7 +64,8 @@ trainer:
   n_gpus_per_node: 8
   nnodes: 1
   save_freq: 5
-  test_freq: 5
+  val_freq: 5
   val_before_train: true
   val_only: false
+  val_generations_to_log_to_wandb: 1
   save_checkpoint_path: null
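Note the key renames in this diff: the `fsdp` section drops `param_offload`/`optimizer_offload`/`torch_dtype` in favor of `enable_full_shard`/`enable_cpu_offload`/`enable_rank0_init`, the `offload` section now uses `offload_params`/`offload_optimizer`, and `trainer.test_freq` becomes `trainer.val_freq`. A hedged sketch of CLI overrides spelled with the new names:

```bash
# Sketch: overrides using the renamed keys; the old names (param_offload,
# optimizer_offload, test_freq) no longer exist after this commit.
python3 -m verl.trainer.main \
    config=examples/grpo_example.yaml \
    worker.actor.offload.offload_params=true \
    worker.actor.offload.offload_optimizer=true \
    trainer.val_freq=10
```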

examples/remax_example.yaml

Lines changed: 16 additions & 11 deletions
@@ -2,6 +2,8 @@ data:
   train_files: hiyouga/math12k@train
   val_files: hiyouga/math12k@test
   prompt_key: problem
+  answer_key: answer
+  image_key: images
   max_prompt_length: 1024
   max_response_length: 1024
   rollout_batch_size: 512
@@ -17,36 +19,38 @@ algorithm:
 worker:
   actor:
     global_batch_size: 128
-    micro_batch_size_per_device_for_update: 1
-    micro_batch_size_per_device_for_experience: 2
+    micro_batch_size_per_device_for_update: 4
+    micro_batch_size_per_device_for_experience: 16
     max_grad_norm: 1.0
     use_kl_loss: true
     kl_loss_coef: 1.0e-3
     kl_loss_type: low_var_kl
+    padding_free: true
+    ulysses_sequence_parallel_size: 1
     model:
       model_path: Qwen/Qwen2.5-7B-Instruct
       enable_gradient_checkpointing: true
     optim:
       lr: 1.0e-6
       weight_decay: 1.0e-2
     fsdp:
-      param_offload: false
-      optimizer_offload: false
-      torch_dtype: null
+      enable_full_shard: true
+      enable_cpu_offload: false
+      enable_rank0_init: false
     offload:
-      param_offload: true
-      optimizer_offload: true
+      offload_params: false
+      offload_optimizer: false

   rollout:
     temperature: 1.0
     tensor_parallel_size: 2
-    gpu_memory_utilization: 0.6
+    gpu_memory_utilization: 0.5
     n: 5
     enable_chunked_prefill: true

   ref:
     offload:
-      param_offload: true
+      offload_params: true

   reward:
     reward_type: function
@@ -56,11 +60,12 @@ trainer:
   total_episodes: 15
   logger: ["console", "wandb"]
   project_name: easy_r1
-  experiment_name: qwen2_5_7b_remax_math
+  experiment_name: qwen2_5_7b_math
   n_gpus_per_node: 8
   nnodes: 1
   save_freq: 5
-  test_freq: 5
+  val_freq: 5
   val_before_train: true
   val_only: false
+  val_generations_to_log_to_wandb: 1
   save_checkpoint_path: null

examples/run_qwen2_5_7b_math.sh

Lines changed: 4 additions & 0 deletions
@@ -4,7 +4,11 @@ export VLLM_ATTENTION_BACKEND=XFORMERS

 MODEL_PATH=Qwen/Qwen2.5-7B-Instruct  # replace it with your local file path

+SYSTEM_PROMPT="""You FIRST think about the reasoning process as an internal monologue and then provide the final answer.
+The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."""
+
 python3 -m verl.trainer.main \
     config=examples/grpo_example.yaml \
+    data.system_prompt="${SYSTEM_PROMPT}" \
     worker.actor.model.model_path=${MODEL_PATH} \
     trainer.n_gpus_per_node=4
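One quoting detail worth checking before launching: in bash, the `"""…"""` form is just two empty strings concatenated around a double-quoted body, and `\b` inside double quotes stays a literal backslash, so `\boxed{}` reaches the trainer intact. A quick check, not part of the commit:

```bash
# Verify the system prompt survives shell quoting before launching training.
SYSTEM_PROMPT="""You FIRST think about the reasoning process as an internal monologue and then provide the final answer.
The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."""
echo "${SYSTEM_PROMPT}"  # should print \boxed{} with the backslash intact
```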

examples/run_qwen2_5_7b_math_swanlab.sh

Lines changed: 4 additions & 0 deletions
@@ -4,8 +4,12 @@ export VLLM_ATTENTION_BACKEND=XFORMERS

 MODEL_PATH=Qwen/Qwen2.5-7B-Instruct  # replace it with your local file path

+SYSTEM_PROMPT="""You FIRST think about the reasoning process as an internal monologue and then provide the final answer.
+The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."""
+
 python3 -m verl.trainer.main \
     config=examples/grpo_example.yaml \
+    data.system_prompt="${SYSTEM_PROMPT}" \
     worker.actor.model.model_path=${MODEL_PATH} \
     trainer.logger=['console','swanlab'] \
     trainer.n_gpus_per_node=4
examples/run_qwen2_5_vl_3b_clevr.sh

File renamed without changes.

examples/run_qwen2_5_vl_3b_geo.sh

Lines changed: 4 additions & 0 deletions
@@ -4,10 +4,14 @@ export VLLM_ATTENTION_BACKEND=XFORMERS

 MODEL_PATH=Qwen/Qwen2.5-VL-3B-Instruct  # replace it with your local file path

+SYSTEM_PROMPT="""You FIRST think about the reasoning process as an internal monologue and then provide the final answer.
+The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."""
+
 python3 -m verl.trainer.main \
     config=examples/grpo_example.yaml \
     data.train_files=hiyouga/geometry3k@train \
     data.val_files=hiyouga/geometry3k@test \
+    data.system_prompt="${SYSTEM_PROMPT}" \
     worker.actor.model.model_path=${MODEL_PATH} \
     worker.rollout.tensor_parallel_size=1 \
     worker.rollout.enable_chunked_prefill=false \

examples/run_qwen2_5_vl_7b_geo.sh

Lines changed: 5 additions & 1 deletion
@@ -4,11 +4,15 @@ export VLLM_ATTENTION_BACKEND=XFORMERS

 MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct  # replace it with your local file path

+SYSTEM_PROMPT="""You FIRST think about the reasoning process as an internal monologue and then provide the final answer.
+The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."""
+
 python3 -m verl.trainer.main \
     config=examples/grpo_example.yaml \
     data.train_files=hiyouga/geometry3k@train \
     data.val_files=hiyouga/geometry3k@test \
+    data.system_prompt="${SYSTEM_PROMPT}" \
     worker.actor.model.model_path=${MODEL_PATH} \
     worker.rollout.enable_chunked_prefill=false \
     trainer.experiment_name=qwen2_5_vl_7b_geo \
-    trainer.n_gpus_per_node=4
+    trainer.n_gpus_per_node=8

examples/run_qwen2_5_vl_7b_geo_swanlab.sh

Lines changed: 5 additions & 1 deletion
@@ -4,12 +4,16 @@ export VLLM_ATTENTION_BACKEND=XFORMERS

 MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct  # replace it with your local file path

+SYSTEM_PROMPT="""You FIRST think about the reasoning process as an internal monologue and then provide the final answer.
+The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."""
+
 python3 -m verl.trainer.main \
     config=examples/grpo_example.yaml \
     data.train_files=hiyouga/geometry3k@train \
     data.val_files=hiyouga/geometry3k@test \
+    data.system_prompt="${SYSTEM_PROMPT}" \
     worker.actor.model.model_path=${MODEL_PATH} \
     worker.rollout.enable_chunked_prefill=false \
     trainer.experiment_name=qwen2_5_vl_7b_geo \
     trainer.logger=['console','swanlab'] \
-    trainer.n_gpus_per_node=4
+    trainer.n_gpus_per_node=8
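Both 7B VL scripts now assume a full 8-GPU node. A hedged fallback for a 4-GPU machine is to override the trainer setting and shrink the micro-batch sizes this commit raised to 4/16; the flag names come from the scripts and YAML above, but this configuration is not validated by the commit:

```bash
# Sketch: run the 7B geometry example on 4 GPUs instead of 8 (untested here).
python3 -m verl.trainer.main \
    config=examples/grpo_example.yaml \
    data.train_files=hiyouga/geometry3k@train \
    data.val_files=hiyouga/geometry3k@test \
    worker.actor.model.model_path=Qwen/Qwen2.5-VL-7B-Instruct \
    worker.actor.micro_batch_size_per_device_for_update=1 \
    worker.actor.micro_batch_size_per_device_for_experience=2 \
    worker.rollout.enable_chunked_prefill=false \
    trainer.n_gpus_per_node=4
```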
