Skip to content

Commit c79fb40

Browse files
committed
Reduce max_tokens to 2048 and switch log calls to lazy %-style formatting
Signed-off-by: amy-why-3459 <[email protected]>
1 parent 7bfd3c0 commit c79fb40

3 files changed

Lines changed: 17 additions & 7 deletions

File tree

vllm_omni/distributed/omni_connectors/adapter.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ def get_through_connector(connector, target_stage_id, stage_id, req_id, connecto
245245
if payload_data:
246246
connector.request_prompt_token_ids[req_id] = payload_data.get("thinker_input_ids", [])
247247
connector.get_requests[req_id] += 1
248-
logger.debug(f"[Stage-{stage_id}] Received one chunk for request {connector_get_key}")
248+
logger.debug("[Stage-%d] Received one chunk for request %s", stage_id, connector_get_key)
249249
break
250250
time.sleep(0.01)
251251
return payload_data
@@ -325,7 +325,7 @@ def put_chunk(
325325
logger.error(f"Failed to use custom_process_input_func for payload extraction: {e}")
326326

327327
if not payload_data:
328-
logger.warning(f"[Stage-{stage_id}] No payload data to send for request {request_id}")
328+
logger.warning("[Stage-%d] No payload data to send for request %s", stage_id, request_id)
329329
return
330330

331331
if stage_id == 0 and chunk_id == 0:
@@ -341,7 +341,7 @@ def put_chunk(
341341
payload_data["thinker_hidden_states"] = torch.cat(
342342
(save_payload.get("thinker_hidden_states"), payload_data.get("thinker_hidden_states")), dim=0
343343
)
344-
logger.info(f"[Stage-{stage_id}] Merged embeddings and hidden states for request {request_id}")
344+
logger.debug("[Stage-%d] Merged embeddings and hidden states for request %s", stage_id, request_id)
345345

346346
if stage_id == 1:
347347
# TODO: Make parameters configurable and optimize algorithms
@@ -367,7 +367,7 @@ def put_chunk(
367367

368368
if success:
369369
connector.put_requests[request_id] += 1
370-
logger.info(f"[Stage-{stage_id}] Sent {connector_put_key}")
370+
logger.debug("[Stage-%d] Sent %s", stage_id, connector_put_key)
371371

372372

373373
def compute_talker_prompt_ids_length(prompt_ids: list[int]) -> int:

vllm_omni/distributed/omni_connectors/connectors/shm_connector.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33

4+
import fcntl
45
import time
56
from collections import defaultdict
67
from typing import Any
@@ -53,7 +54,12 @@ def put(self, from_stage: str, to_stage: str, put_key: str, data: Any) -> tuple[
5354
# if size > self.threshold:
5455
if True: # TODO: correct put & get logic
5556
# Use Shared Memory
56-
meta = shm_write_bytes(payload, name=put_key)
57+
lock_file = f"/dev/shm/shm_{put_key}_lockfile.lock"
58+
with open(lock_file, "w") as lockf:
59+
fcntl.flock(lockf, fcntl.LOCK_EX)
60+
meta = shm_write_bytes(payload, name=put_key)
61+
fcntl.flock(lockf, fcntl.LOCK_UN)
62+
5763
# meta contains {'name': ..., 'size': ...}
5864
metadata[put_key] = {"shm": meta, "size": size}
5965
self._metrics["shm_writes"] += 1
@@ -97,7 +103,11 @@ def get(self, from_stage: str, to_stage: str, get_key: str, metadata=None) -> tu
97103
return None, 0
98104

99105
try:
100-
data_bytes = shm_read_bytes({"name": get_key, "size": shm.size})
106+
lock_file = f"/dev/shm/shm_{get_key}_lockfile.lock"
107+
with open(lock_file) as lockf:
108+
fcntl.flock(lockf, fcntl.LOCK_SH)
109+
data_bytes = shm_read_bytes({"name": get_key, "size": shm.size})
110+
fcntl.flock(lockf, fcntl.LOCK_UN)
101111
obj = self.deserialize_obj(data_bytes)
102112
return obj, shm.size
103113
finally:

vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ stage_args:
6363
default_sampling_params:
6464
temperature: 0.9
6565
top_k: 50
66-
max_tokens: 4096
66+
max_tokens: 2048 # TODO: The max_tokens of the async_chunk feature cannot exceed 2048.
6767
seed: 42
6868
detokenize: False
6969
repetition_penalty: 1.05

0 commit comments

Comments (0)