
Commit 035e54b

full propagation
Signed-off-by: Will Eaton <[email protected]>
1 parent d3d247d commit 035e54b

File tree

5 files changed: +48 -24 lines changed


tests/v1/kv_connector/nixl_integration/toy_proxy_server.py

Lines changed: 28 additions & 15 deletions
@@ -12,8 +12,7 @@
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse
 
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
+logger = logging.getLogger("uvicorn.error")
 
 
 @asynccontextmanager
@@ -180,7 +179,8 @@ async def send_request_to_service(client_info: dict, endpoint: str,
 
 
 async def stream_service_response(client_info: dict, endpoint: str,
-                                  req_data: dict, request_id: str):
+                                  req_data: dict, request_id: str,
+                                  request: Request):
     """
     Asynchronously stream response from a service using a client from the pool.
     """
@@ -189,29 +189,41 @@ async def stream_service_response(client_info: dict, endpoint: str,
         "X-Request-Id": request_id
     }
 
+    # get logger from request state for ASGI integration
+    req_logger = getattr(request.app.state, 'logger', logger)
+
     async with client_info['client'].stream("POST",
                                             endpoint,
                                             json=req_data,
                                             headers=headers) as response:
-        logger.info("Decode server response status: %s for request %s",
-                    response.status_code, request_id)
+        req_logger.info("Decode server response status: %s for request %s",
+                        response.status_code, request_id)
 
         # handle error responses with context
         if response.status_code >= 400:
            error_body = await response.aread()
            try:
                import json
                error_data = json.loads(error_body)
-                logger.error(
-                    "Decode server error %d for request %s: %s. " \
-                    "Error context: %s",
-                    response.status_code, request_id,
-                    error_data.get('message', 'no message'),
-                    error_data.get('error_context', 'no context'))
+                error_ctx = error_data.get('error', {}).get('error_context')
+                if error_ctx:
+                    req_logger.error(
+                        "Decode server error %d for request %s: %s. "
+                        "Error type: %s, Metadata: %s", response.status_code,
+                        request_id,
+                        error_data.get('error',
+                                       {}).get('message', 'no message'),
+                        error_ctx.get('error_type'), error_ctx.get('metadata'))
+                else:
+                    req_logger.error(
+                        "Decode server error %d for request %s: %s",
+                        response.status_code, request_id,
+                        error_data.get('error',
+                                       {}).get('message', 'no message'))
            except json.JSONDecodeError:
-                logger.error("Decode server error %d for request %s: %s",
-                             response.status_code, request_id,
-                             error_body.decode('utf-8'))
+                req_logger.error("Decode server error %d for request %s: %s",
+                                 response.status_code, request_id,
+                                 error_body.decode('utf-8'))
            response.raise_for_status()
 
         async for chunk in response.aiter_bytes():
@@ -247,7 +259,8 @@ async def generate_stream():
             async for chunk in stream_service_response(decode_client_info,
                                                        api,
                                                        req_data,
-                                                       request_id=request_id):
+                                                       request_id=request_id,
+                                                       request=request):
                 chunk_count += 1
                 # parse SSE data to log key fields
                 chunk_str = chunk.decode('utf-8')
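Note: the nested error_data.get('error', {}).get('error_context') lookup above assumes the decode server replies with the OpenAI-style error envelope built in protocol.py. A minimal, self-contained sketch of that parsing path, using a hypothetical error body (all field values below are made up for illustration):

import json

# Hypothetical decode-server error body; the ErrorInfo fields sit under the
# top-level "error" key, with the new optional "error_context" dict inside.
sample_body = json.dumps({
    "error": {
        "message": "KV transfer failed",
        "type": "InternalServerError",
        "code": 500,
        "error_context": {
            "error_type": "kv_connector_error",
            "metadata": {"request_id": "req-123"},
        },
    }
})

error_data = json.loads(sample_body)
error_ctx = error_data.get('error', {}).get('error_context')
if error_ctx:
    # Same fields the proxy now logs: error_type and metadata.
    print(error_ctx.get('error_type'), error_ctx.get('metadata'))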

vllm/entrypoints/openai/protocol.py

Lines changed: 1 addition & 0 deletions
@@ -115,6 +115,7 @@ class ErrorInfo(OpenAIBaseModel):
     type: str
     param: Optional[str] = None
     code: int
+    error_context: Optional[dict[str, Any]] = None
 
 
 class ErrorResponse(OpenAIBaseModel):
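The new field is optional, so existing error responses without context still validate. A standalone sketch of how it serializes (this mirrors ErrorInfo rather than importing the vLLM class, and the values are illustrative):

from typing import Any, Optional

from pydantic import BaseModel


# Stand-in for ErrorInfo; the real class derives from OpenAIBaseModel.
class ErrorInfoSketch(BaseModel):
    message: str
    type: str
    param: Optional[str] = None
    code: int
    error_context: Optional[dict[str, Any]] = None


info = ErrorInfoSketch(message="engine dead",
                       type="InternalServerError",
                       code=500,
                       error_context={"error_type": "engine_dead"})
print(info.model_dump())
# error_context stays None (serialized as null) when no context is attached.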

vllm/entrypoints/openai/serving_chat.py

Lines changed: 4 additions & 2 deletions
@@ -585,7 +585,8 @@ async def chat_completion_stream_generator(
                         error_ctx.message,
                         err_type=error_ctx.error_type,
                         status_code=error_ctx.http_status
-                        or HTTPStatus.INTERNAL_SERVER_ERROR)
+                        or HTTPStatus.INTERNAL_SERVER_ERROR,
+                        error_context=res.error_context)
                 else:
                     yield self.create_streaming_error_response(
                         "Request aborted due to an internal error.",
@@ -1197,7 +1198,8 @@ async def chat_completion_full_generator(
                     error_ctx.message,
                     err_type=error_ctx.error_type,
                     status_code=error_ctx.http_status
-                    or HTTPStatus.INTERNAL_SERVER_ERROR)
+                    or HTTPStatus.INTERNAL_SERVER_ERROR,
+                    error_context=res.error_context)
             else:
                 return self.create_error_response(
                     "Request aborted due to an internal error.",
vllm/entrypoints/openai/serving_completion.py

Lines changed: 4 additions & 2 deletions
@@ -282,7 +282,8 @@ async def create_completion(
                     error_ctx.message,
                     err_type=error_ctx.error_type,
                     status_code=error_ctx.http_status
-                    or HTTPStatus.INTERNAL_SERVER_ERROR)
+                    or HTTPStatus.INTERNAL_SERVER_ERROR,
+                    error_context=final_res.error_context)
             else:
                 return self.create_error_response(
                     "Request aborted due to an internal error.",
@@ -372,7 +373,8 @@ async def completion_stream_generator(
                         error_ctx.message,
                         err_type=error_ctx.error_type,
                         status_code=error_ctx.http_status
-                        or HTTPStatus.INTERNAL_SERVER_ERROR)
+                        or HTTPStatus.INTERNAL_SERVER_ERROR,
+                        error_context=res.error_context)
                 else:
                     yield self.create_streaming_error_response(
                         "Request aborted due to an internal error.",

vllm/entrypoints/openai/serving_engine.py

Lines changed: 11 additions & 5 deletions
@@ -441,26 +441,32 @@ def create_error_response(
         message: str,
         err_type: str = "BadRequestError",
         status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+        error_context: Optional[dict[str, Any]] = None,
     ) -> ErrorResponse:
         if self.log_error_stack:
             exc_type, _, _ = sys.exc_info()
             if exc_type is not None:
                 traceback.print_exc()
             else:
                 traceback.print_stack()
-        return ErrorResponse(error=ErrorInfo(
-            message=message, type=err_type, code=status_code.value))
+        return ErrorResponse(error=ErrorInfo(message=message,
+                                             type=err_type,
+                                             code=status_code.value,
+                                             error_context=error_context))
 
     def create_streaming_error_response(
         self,
         message: str,
         err_type: str = "BadRequestError",
         status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+        error_context: Optional[dict[str, Any]] = None,
     ) -> str:
         json_str = json.dumps(
-            self.create_error_response(message=message,
-                                       err_type=err_type,
-                                       status_code=status_code).model_dump())
+            self.create_error_response(
+                message=message,
+                err_type=err_type,
+                status_code=status_code,
+                error_context=error_context).model_dump())
         return json_str
 
     async def _check_model(
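Taken together, create_error_response and create_streaming_error_response now carry the context end to end, and the toy proxy reads it back out of the JSON body. A rough round-trip sketch (a standalone approximation with hypothetical values, not a call into the vLLM classes themselves):

import json
from http import HTTPStatus
from typing import Any, Optional


# Approximation of what create_streaming_error_response serializes: the
# ErrorResponse envelope with the new error_context field attached.
def streaming_error_sketch(message: str,
                           err_type: str = "BadRequestError",
                           status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
                           error_context: Optional[dict[str, Any]] = None) -> str:
    return json.dumps({
        "error": {
            "message": message,
            "type": err_type,
            "param": None,
            "code": status_code.value,
            "error_context": error_context,
        }
    })


# Producer side: an internal error with attached context.
json_str = streaming_error_sketch(
    "Request aborted due to an internal error.",
    err_type="InternalServerError",
    status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
    error_context={"error_type": "engine_dead", "metadata": {}})

# Consumer side: the same nested lookup the toy proxy performs.
ctx = json.loads(json_str).get('error', {}).get('error_context')
assert ctx and ctx['error_type'] == 'engine_dead'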
