MahmoudAshraf97
diff --git a/python/sglang/srt/disaggregation/decode.py b/python/sglang/srt/disaggregation/decode.py
Lines changed: 2 additions & 8 deletions
diff --git a/python/sglang/srt/disaggregation/prefill.py b/python/sglang/srt/disaggregation/prefill.py
Lines changed: 2 additions & 6 deletions
diff --git a/python/sglang/srt/entrypoints/engine.py b/python/sglang/srt/entrypoints/engine.py
Lines changed: 11 additions & 17 deletions
diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py
Lines changed: 10 additions & 2 deletions
diff --git a/python/sglang/srt/managers/io_struct.py b/python/sglang/srt/managers/io_struct.py
Lines changed: 0 additions & 2 deletions
@@ -694,10 +694,7 @@ def event_loop_normal_disagg_decode(self: Scheduler):
                 + len(self.disagg_decode_prealloc_queue.queue)
                 == 0
             ):
-                # When the server is idle, do self-check and re-init some states
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-                self.maybe_sleep_on_idle()
+                self.self_check_during_idle()
 
             self.last_batch = batch
 
@@ -771,10 +768,7 @@ def event_loop_overlap_disagg_decode(self: Scheduler):
                 + len(self.disagg_decode_prealloc_queue.queue)
                 == 0
             ):
-                # When the server is idle, do self-check and re-init some states
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-                self.maybe_sleep_on_idle()
+                self.self_check_during_idle()
 
             self.last_batch = batch
             self.last_batch_in_queue = last_batch_in_queue
 
@@ -287,9 +287,7 @@ def event_loop_normal_disagg_prefill(self: Scheduler) -> None:
                 self.process_disagg_prefill_inflight_queue()
 
             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-                self.maybe_sleep_on_idle()
+                self.self_check_during_idle()
 
             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
@@ -337,9 +335,7 @@ def event_loop_overlap_disagg_prefill(self: Scheduler) -> None:
                 self.process_disagg_prefill_inflight_queue()
 
             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
-                self.check_memory()
-                self.new_token_ratio = self.init_new_token_ratio
-                self.maybe_sleep_on_idle()
+                self.self_check_during_idle()
 
             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
 
@@ -652,25 +652,19 @@ def _set_envs_and_config(server_args: ServerArgs):
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
-    def sigchld_handler(signum, frame):
-        pid, exitcode = os.waitpid(0, os.WNOHANG)
-        if exitcode != 0:
-            logger.warning(
-                f"Child process unexpectedly failed with {exitcode=}. {pid=}"
+    if True:  # Keep this check for internal code compatibility
+        # Register the signal handler.
+        # The child processes will send SIGQUIT to this process when any error happens
+        # This process then cleans up the whole process tree
+        # Note: This sigquit handler is used in the launch phase, and may be replaced by
+        # the running_phase_sigquit_handler in the tokenizer manager after the grpc server is launched.
+        def launch_phase_sigquit_handler(signum, frame):
+            logger.error(
+                "Received sigquit from a child process. It usually means the child failed."
             )
+            kill_process_tree(os.getpid())
 
-    signal.signal(signal.SIGCHLD, sigchld_handler)
-
-    # Register the signal handler.
-    # The child processes will send SIGQUIT to this process when any error happens
-    # This process then clean up the whole process tree
-    def sigquit_handler(signum, frame):
-        logger.error(
-            "Received sigquit from a child process. It usually means the child failed."
-        )
-        kill_process_tree(os.getpid())
-
-    signal.signal(signal.SIGQUIT, sigquit_handler)
+        signal.signal(signal.SIGQUIT, launch_phase_sigquit_handler)
 
     # Set mp start method
     mp.set_start_method("spawn", force=True)
 
@@ -238,6 +238,9 @@ async def health() -> Response:
 @app.get("/health_generate")
 async def health_generate(request: Request) -> Response:
     """Check the health of the inference server by generating one token."""
+    if _global_state.tokenizer_manager.gracefully_exit:
+        logger.info("Health check request received during shutdown. Returning 503.")
+        return Response(status_code=503)
 
     sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
     rid = f"HEALTH_CHECK_{time.time()}"
@@ -260,9 +263,14 @@ async def gen():
         async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
             break
 
-    tic = time.perf_counter()
+    # This request is a special request.
+    # If the server already has something running, this request will be ignored, so it creates zero overhead.
+    # If the server is not running anything, this request will be run, so we know whether the server is healthy.
     task = asyncio.create_task(gen())
-    while time.perf_counter() < tic + HEALTH_CHECK_TIMEOUT:
+
+    # As long as we receive any response from the detokenizer/scheduler, we consider the server healthy.
+    tic = time.time()
+    while time.time() < tic + HEALTH_CHECK_TIMEOUT:
         await asyncio.sleep(1)
         if _global_state.tokenizer_manager.last_receive_tstamp > tic:
             task.cancel()
 
@@ -152,8 +152,6 @@ def normalize_batch_and_arguments(self):
         else:
             self._normalize_batch_inputs()
 
-        self._validate_session_params()
-
     def _validate_inputs(self):
         """Validate that the input configuration is valid."""
         if (