8080from vllm .entrypoints .openai .serving_transcription import (
8181 OpenAIServingTranscription )
8282from vllm .entrypoints .openai .tool_parsers import ToolParserManager
83- from vllm .entrypoints .utils import with_cancellation
83+ from vllm .entrypoints .utils import load_aware_call , with_cancellation
8484from vllm .logger import init_logger
8585from vllm .usage .usage_lib import UsageContext
8686from vllm .utils import (FlexibleArgumentParser , get_open_zmq_ipc_path ,
@@ -347,6 +347,24 @@ async def health(raw_request: Request) -> Response:
347347 return Response (status_code = 200 )
348348
349349
@router.get("/load")
async def get_server_load_metrics(request: Request):
    """Report the server's current load metric as JSON.

    The returned counter tracks in-flight GPU-utilizing requests on the
    routes wrapped with the load-aware decorator:
      - /v1/chat/completions
      - /v1/completions
      - /v1/audio/transcriptions
      - /v1/embeddings
      - /pooling
      - /score
      - /v1/score
      - /rerank
      - /v1/rerank
      - /v2/rerank
    """
    # The counter lives on app.state; it is initialized in init_app_state
    # and incremented/decremented around each tracked request.
    current_load = request.app.state.server_load_metrics
    return JSONResponse(content={'server_load': current_load})
366+
367+
350368@router .api_route ("/ping" , methods = ["GET" , "POST" ])
351369async def ping (raw_request : Request ) -> Response :
352370 """Ping check. Endpoint required for SageMaker"""
@@ -400,6 +418,7 @@ async def show_version():
400418@router .post ("/v1/chat/completions" ,
401419 dependencies = [Depends (validate_json_request )])
402420@with_cancellation
421+ @load_aware_call
403422async def create_chat_completion (request : ChatCompletionRequest ,
404423 raw_request : Request ):
405424 handler = chat (raw_request )
@@ -421,6 +440,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
421440
422441@router .post ("/v1/completions" , dependencies = [Depends (validate_json_request )])
423442@with_cancellation
443+ @load_aware_call
424444async def create_completion (request : CompletionRequest , raw_request : Request ):
425445 handler = completion (raw_request )
426446 if handler is None :
@@ -439,6 +459,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
439459
440460@router .post ("/v1/embeddings" , dependencies = [Depends (validate_json_request )])
441461@with_cancellation
462+ @load_aware_call
442463async def create_embedding (request : EmbeddingRequest , raw_request : Request ):
443464 handler = embedding (raw_request )
444465 if handler is None :
@@ -485,6 +506,7 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
485506
486507@router .post ("/pooling" , dependencies = [Depends (validate_json_request )])
487508@with_cancellation
509+ @load_aware_call
488510async def create_pooling (request : PoolingRequest , raw_request : Request ):
489511 handler = pooling (raw_request )
490512 if handler is None :
@@ -503,6 +525,7 @@ async def create_pooling(request: PoolingRequest, raw_request: Request):
503525
504526@router .post ("/score" , dependencies = [Depends (validate_json_request )])
505527@with_cancellation
528+ @load_aware_call
506529async def create_score (request : ScoreRequest , raw_request : Request ):
507530 handler = score (raw_request )
508531 if handler is None :
@@ -521,6 +544,7 @@ async def create_score(request: ScoreRequest, raw_request: Request):
521544
522545@router .post ("/v1/score" , dependencies = [Depends (validate_json_request )])
523546@with_cancellation
547+ @load_aware_call
524548async def create_score_v1 (request : ScoreRequest , raw_request : Request ):
525549 logger .warning (
526550 "To indicate that Score API is not part of standard OpenAI API, we "
@@ -531,10 +555,10 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
531555
532556@router .post ("/v1/audio/transcriptions" )
533557@with_cancellation
558+ @load_aware_call
534559async def create_transcriptions (request : Annotated [TranscriptionRequest ,
535560 Form ()],
536561 raw_request : Request ):
537-
538562 handler = transcription (raw_request )
539563 if handler is None :
540564 return base (raw_request ).create_error_response (
@@ -556,6 +580,7 @@ async def create_transcriptions(request: Annotated[TranscriptionRequest,
556580
557581@router .post ("/rerank" , dependencies = [Depends (validate_json_request )])
558582@with_cancellation
583+ @load_aware_call
559584async def do_rerank (request : RerankRequest , raw_request : Request ):
560585 handler = rerank (raw_request )
561586 if handler is None :
@@ -894,6 +919,9 @@ async def init_app_state(
894919 ) if model_config .runner_type == "transcription" else None
895920 state .task = model_config .task
896921
922+ state .enable_server_load_tracking = args .enable_server_load_tracking
923+ state .server_load_metrics = 0
924+
897925
898926def create_server_socket (addr : tuple [str , int ]) -> socket .socket :
899927 family = socket .AF_INET
0 commit comments