
Commit 76298ad

[recipe] feat: Add sleep/wakeup mode for gen rm vllm service and add tqdm showing process (#2739)
### What does this PR do?

Add sleep/wakeup mode for the GenRM vLLM service and add a tqdm progress bar. This capability is particularly beneficial when the model server shares resources with a training workload on the same machine. It allows the reward model service to be temporarily offloaded (to free up GPU memory) during intensive training sessions and reloaded when the service is required again.
1 parent d640f99 commit 76298ad

File tree

2 files changed: +40 -9 lines changed


recipe/genrm_remote/README.md

Lines changed: 9 additions & 1 deletion
@@ -7,8 +7,16 @@
 Deploy the pretrained GenRM model using vLLM. Skip this step if you want to use an external api service.
 
 ```bash
-vllm serve verl-team/GenRM-CI-Test-1.5B --served-model-name genrm-demo
+VLLM_SERVER_DEV_MODE=1 vllm serve verl-team/GenRM-CI-Test-1.5B --served-model-name genrm-demo --enable-sleep-mode --dtype float32
 ```
+
+Note that the wake_up and sleep operations for managing CUDA memory in vLLM are only available when both `VLLM_SERVER_DEV_MODE=1` and `enable_sleep_mode` are set. This capability is particularly beneficial when the model server shares resources with a training workload on the same machine. It allows the reward model service to be temporarily offloaded (to free up GPU memory) during intensive training sessions and reloaded when the service is required again. The relevant vllm code implementation can be found below:
+
+[VLLM_SERVER_DEV_MODE](https://github.com/vllm-project/vllm/blob/5a19a6c6705fe83db2e3517a2d2f473586901743/vllm/entrypoints/openai/api_server.py#L971)
+
+[sleep and wake_up mode](https://github.com/vllm-project/vllm/blob/5a19a6c6705fe83db2e3517a2d2f473586901743/vllm/entrypoints/openai/api_server.py#L994-L1003)
+
+When the backend is configured as `SERVER_BACKEND`="VLLM", the `USE_OFFLOAD` flag can be toggled between True and False (see `reward_function.py`).
 
 ### Step 2: Perform RL using GenRM
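For quick verification of the sleep/wake_up endpoints mentioned in the README change, here is a minimal sketch. It assumes the server was launched with `VLLM_SERVER_DEV_MODE=1` and `--enable-sleep-mode` and is reachable at the `http://localhost:30000` base URL used by `reward_function.py`; the `toggle_server` helper is illustrative and not part of this commit.

```python
import requests

BASE_URL = "http://localhost:30000"  # assumed: same base URL as in reward_function.py

def toggle_server(task: str) -> None:
    """POST to the dev-mode /sleep or /wake_up endpoint exposed by vLLM."""
    assert task in ("sleep", "wake_up"), f"Invalid task: {task}"
    response = requests.post(f"{BASE_URL}/{task}")
    response.raise_for_status()  # a 200 response means the request was accepted

if __name__ == "__main__":
    toggle_server("sleep")    # offload the reward model to free GPU memory
    toggle_server("wake_up")  # reload it before the next scoring round
```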

recipe/genrm_remote/reward_function.py

Lines changed: 31 additions & 8 deletions
@@ -12,13 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from concurrent.futures import ThreadPoolExecutor
+import random
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from time import sleep
 
 import requests
+import tqdm
 
 from verl.utils.reward_score.math import last_boxed_only_string, remove_boxed
 
+SERVER_BACKEND = "VLLM"
+USE_OFFLOAD = True
 BASE_URL = "http://localhost:30000"
 API_KEY = "EMPTY"
 MAX_RETRIES = 3
@@ -42,6 +47,13 @@
 """.strip()
 
 
+def vllm_execute_method(task="sleep"):
+    assert task in ["sleep", "wake_up"], f"Invalid task: {task}"
+    url_root = BASE_URL
+    response = requests.post(url_root + "/" + task)
+    assert response.status_code == 200
+
+
 def get_response(problem, solution_str, ground_truth):
     prompt = GENRM_PROMPT_TEMPLATE.format(problem=problem, solution=solution_str)
     messages = [{"role": "user", "content": prompt}]
@@ -77,14 +89,14 @@ def compute_reward(response):
     return reward_score
 
 
-def compute_score(data_source, solution_str, ground_truth, extra_info):
+def compute_score(data_source, solution_str, ground_truth, extra_info, index):
     split = extra_info["split"]
     from verl.utils.reward_score import default_compute_score
 
     func_rm_score = default_compute_score(data_source, solution_str, ground_truth, extra_info)
 
     if split == "test":
-        return func_rm_score
+        return func_rm_score, index
     else:
         problem = extra_info["question"]
         response = get_response(problem, solution_str, ground_truth)
@@ -93,18 +105,29 @@ def compute_score(data_source, solution_str, ground_truth, extra_info):
     else:
         reward_score = 0.0
 
-    return reward_score
+    return reward_score, index
 
 
 def compute_score_batch(data_sources, solution_strs, ground_truths, extra_infos):
+    results = []
+    indexes = list(range(len(data_sources)))
+    if SERVER_BACKEND == "VLLM" and USE_OFFLOAD:
+        vllm_execute_method("wake_up")
+
     with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
         futures = []
-        for data_source, solution_str, ground_truth, extra_info in zip(
-            data_sources, solution_strs, ground_truths, extra_infos, strict=True
+        for data_source, solution_str, ground_truth, extra_info, index in zip(
+            data_sources, solution_strs, ground_truths, extra_infos, indexes, strict=True
         ):
-            future = executor.submit(compute_score, data_source, solution_str, ground_truth, extra_info)
+            future = executor.submit(compute_score, data_source, solution_str, ground_truth, extra_info, index)
+            time.sleep(0.001 * random.random())
             futures.append(future)
 
-        results = [future.result() for future in futures]
+        for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
+            results.append(future.result())
+        results = sorted(results, key=lambda x: x[-1], reverse=False)
+        results = [result[0] for result in results]
 
+    if SERVER_BACKEND == "VLLM" and USE_OFFLOAD:
+        vllm_execute_method("sleep")
     return results
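To illustrate the concurrency pattern introduced above (submit each sample with its index, collect results via `as_completed` under a tqdm progress bar, then restore the original order), here is a small self-contained sketch. The `score_one` function and its length-based score are hypothetical stand-ins for `compute_score`; only the ordering/progress pattern mirrors the commit, and `tqdm` is the same extra dependency the diff adds.

```python
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import tqdm

def score_one(sample: str, index: int) -> tuple[float, int]:
    # Hypothetical stand-in for compute_score: returns (score, original index).
    time.sleep(random.random() * 0.01)  # simulate variable scoring latency
    return float(len(sample)), index

def score_batch(samples: list[str]) -> list[float]:
    results = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(score_one, s, i) for i, s in enumerate(samples)]
        # as_completed yields futures as they finish, which lets tqdm show real progress,
        # but it also scrambles the completion order ...
        for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
            results.append(future.result())
    # ... so the carried index is used to put the scores back into input order.
    results.sort(key=lambda pair: pair[-1])
    return [score for score, _ in results]

if __name__ == "__main__":
    print(score_batch(["a", "bbb", "cc"]))  # -> [1.0, 3.0, 2.0]
```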
