2020)
2121from app .cache .cache import cache
2222from app .logger import log
23+ from app .metrics import get_proxy_metrics
2324from app .quote .quote import (
2425 ECDSA ,
2526 ED25519 ,
@@ -65,7 +66,6 @@ async def stream_vllm_response(
6566 request_body : bytes ,
6667 modified_request_body : bytes ,
6768 request_hash : Optional [str ] = None ,
68- requested_model : Optional [str ] = None ,
6969):
7070 """
7171 Handle streaming vllm request
@@ -75,7 +75,6 @@ async def stream_vllm_response(
7575 request_hash: Optional hash from request header (X-Request-Hash). Used by trusted clients to provide
7676 pre-calculated request hash, avoiding redundant hash computation. Falls back to
7777 calculating hash from request_body if not provided
78- requested_model: The model name requested by the client
7978 Returns:
8079 A streaming response
8180 """
@@ -103,11 +102,6 @@ async def generate_stream(response):
103102 # Extract the cache key (data.id) from the first chunk
104103 if not chat_id :
105104 chat_id = chunk_data .get ("id" )
106-
107- # Override the model name if requested_model is provided
108- if requested_model and "model" in chunk_data :
109- chunk_data ["model" ] = requested_model
110- final_chunk = f"data: { json .dumps (chunk_data )} \n "
111105
112106 except Exception as e :
113107 error_message = f"Failed to parse chunk: { e } \n The original data is: { data } "
@@ -148,6 +142,7 @@ async def generate_stream(response):
148142 generate_stream (response ),
149143 background = BackgroundTasks ([response .aclose , client .aclose ]),
150144 media_type = "text/event-stream" ,
145+ headers = {"X-Accel-Buffering" : "no" },
151146 )
152147
153148
@@ -157,7 +152,6 @@ async def non_stream_vllm_response(
157152 request_body : bytes ,
158153 modified_request_body : bytes ,
159154 request_hash : Optional [str ] = None ,
160- requested_model : Optional [str ] = None ,
161155):
162156 """
163157 Handle non-streaming responses
@@ -167,7 +161,6 @@ async def non_stream_vllm_response(
167161 request_hash: Optional hash from request header (X-Request-Hash). Used by trusted clients to provide
168162 pre-calculated request hash, avoiding redundant hash computation. Falls back to
169163 calculating hash from request_body if not provided
170- requested_model: The model name requested by the client
171164 Returns:
172165 The response data
173166 """
@@ -186,10 +179,6 @@ async def non_stream_vllm_response(
186179 raise HTTPException (status_code = response .status_code , detail = response .text )
187180
188181 response_data = response .json ()
189-
190- # Override the model name if requested_model is provided
191- if requested_model and "model" in response_data :
192- response_data ["model" ] = requested_model
193182
194183 # Cache the request-response pair using the chat ID
195184 chat_id = response_data .get ("id" )
@@ -270,18 +259,16 @@ async def chat_completions(
270259 is_stream = modified_json .get (
271260 "stream" , False
272261 ) # Default to non-streaming if not specified
273- requested_model = modified_json .get ("model" )
274-
275262 modified_request_body = json .dumps (modified_json ).encode ("utf-8" )
276263 if is_stream :
277264 # Create a streaming response
278265 return await stream_vllm_response (
279- VLLM_URL , request_body , modified_request_body , x_request_hash , requested_model
266+ VLLM_URL , request_body , modified_request_body , x_request_hash
280267 )
281268 else :
282269 # Handle non-streaming response
283270 response_data = await non_stream_vllm_response (
284- VLLM_URL , request_body , modified_request_body , x_request_hash , requested_model
271+ VLLM_URL , request_body , modified_request_body , x_request_hash
285272 )
286273 return JSONResponse (content = response_data )
287274
@@ -301,18 +288,16 @@ async def completions(
301288 is_stream = modified_json .get (
302289 "stream" , False
303290 ) # Default to non-streaming if not specified
304- requested_model = modified_json .get ("model" )
305-
306291 modified_request_body = json .dumps (modified_json ).encode ("utf-8" )
307292 if is_stream :
308293 # Create a streaming response
309294 return await stream_vllm_response (
310- VLLM_COMPLETIONS_URL , request_body , modified_request_body , x_request_hash , requested_model
295+ VLLM_COMPLETIONS_URL , request_body , modified_request_body , x_request_hash
311296 )
312297 else :
313298 # Handle non-streaming response
314299 response_data = await non_stream_vllm_response (
315- VLLM_COMPLETIONS_URL , request_body , modified_request_body , x_request_hash , requested_model
300+ VLLM_COMPLETIONS_URL , request_body , modified_request_body , x_request_hash
316301 )
317302 return JSONResponse (content = response_data )
318303
@@ -355,11 +340,25 @@ async def signature(request: Request, chat_id: str, signing_algo: str = None):
# Metrics of vLLM instance
@router.get("/metrics")
async def metrics(request: Request):
    """
    Serve combined Prometheus metrics: this proxy's own counters plus the
    vLLM backend's /metrics output.

    The backend fetch is best-effort: if vLLM is unreachable or returns a
    non-200 status, a Prometheus comment line is substituted in its place so
    the proxy's local metrics are still returned with HTTP 200.
    """
    # Metrics produced by the proxy process itself.
    local_metrics = get_proxy_metrics()

    # Best-effort query of the vLLM backend; a backend outage must not turn
    # the whole /metrics endpoint into an error response.
    try:
        async with httpx.AsyncClient(timeout=httpx.Timeout(TIMEOUT)) as client:
            backend_response = await client.get(VLLM_METRICS_URL)
            if backend_response.status_code == 200:
                remote_metrics = backend_response.text
            else:
                log.warning(f"Failed to fetch vLLM metrics: {backend_response.status_code}")
                remote_metrics = f"# Failed to fetch vLLM metrics: {backend_response.status_code}"
    except Exception as e:
        log.error(f"Error fetching vLLM metrics: {e}")
        remote_metrics = f"# Error fetching vLLM metrics: {e}"

    # Concatenate both sections, separated by a Prometheus comment banner.
    combined_metrics = f"{local_metrics}\n\n# --- vLLM Backend Metrics ---\n\n{remote_metrics}"
    return PlainTextResponse(combined_metrics)
363362
364363
365364@router .get ("/models" )
0 commit comments