@@ -380,6 +380,7 @@ def generate(
380380 lora_request : Optional [LoRARequest ] = None ,
381381 trace_headers : Optional [Mapping [str , str ]] = None ,
382382 prompt_adapter_request : Optional [PromptAdapterRequest ] = None ,
383+ priority : int = 0 ,
383384 ) -> AsyncGenerator [RequestOutput , None ]:
384385 ...
385386
@@ -392,6 +393,7 @@ def generate(
392393 lora_request : Optional [LoRARequest ] = None ,
393394 trace_headers : Optional [Mapping [str , str ]] = None ,
394395 prompt_adapter_request : Optional [PromptAdapterRequest ] = None ,
396+ priority : int = 0 ,
395397 ) -> AsyncGenerator [RequestOutput , None ]:
396398 ...
397399
@@ -407,6 +409,7 @@ def generate(
407409 lora_request : Optional [LoRARequest ] = None ,
408410 trace_headers : Optional [Mapping [str , str ]] = None ,
409411 prompt_adapter_request : Optional [PromptAdapterRequest ] = None ,
412+ priority : int = 0 ,
410413 * ,
411414 inputs : Optional [PromptType ] = None # DEPRECATED
412415 ) -> AsyncGenerator [RequestOutput , None ]:
@@ -425,6 +428,9 @@ def generate(
425428 trace_headers: OpenTelemetry trace headers.
426429 prompt_adapter_request: Prompt Adapter request to use
427430 for generation, if any.
431+ priority: Priority of the request (lower means earlier handling).
432+ Any priority other than 0 will lead to an error if the
433+ scheduling policy is not "priority".
428434 """
429435 if inputs is not None :
430436 prompt = inputs
@@ -433,7 +439,7 @@ def generate(
433439
434440 return self ._process_request (prompt , sampling_params , request_id ,
435441 lora_request , trace_headers ,
436- prompt_adapter_request )
442+ prompt_adapter_request , priority )
437443
438444 @overload # DEPRECATED
439445 def encode (
@@ -444,6 +450,7 @@ def encode(
444450 request_id : str ,
445451 lora_request : Optional [LoRARequest ] = None ,
446452 trace_headers : Optional [Mapping [str , str ]] = None ,
453+ priority : int = 0 ,
447454 ) -> AsyncGenerator [EmbeddingRequestOutput , None ]:
448455 ...
449456
@@ -455,6 +462,7 @@ def encode(
455462 request_id : str ,
456463 lora_request : Optional [LoRARequest ] = None ,
457464 trace_headers : Optional [Mapping [str , str ]] = None ,
465+ priority : int = 0 ,
458466 ) -> AsyncGenerator [EmbeddingRequestOutput , None ]:
459467 ...
460468
@@ -469,6 +477,7 @@ def encode(
469477 request_id : Optional [str ] = None ,
470478 lora_request : Optional [LoRARequest ] = None ,
471479 trace_headers : Optional [Mapping [str , str ]] = None ,
480+ priority : int = 0 ,
472481 * ,
473482 inputs : Optional [PromptType ] = None # DEPRECATED
474483 ) -> AsyncGenerator [EmbeddingRequestOutput , None ]:
@@ -496,7 +505,7 @@ def encode(
496505 and request_id is not None )
497506
498507 return self ._process_request (prompt , pooling_params , request_id ,
499- lora_request , trace_headers )
508+ lora_request , trace_headers , priority )
500509
501510 async def _process_request (
502511 self ,
@@ -505,7 +514,8 @@ async def _process_request(
505514 request_id : str ,
506515 lora_request : Optional [LoRARequest ] = None ,
507516 trace_headers : Optional [Mapping [str , str ]] = None ,
508- prompt_adapter_request : Optional [PromptAdapterRequest ] = None
517+ prompt_adapter_request : Optional [PromptAdapterRequest ] = None ,
518+ priority : int = 0 ,
509519 ) -> Union [AsyncGenerator [RequestOutput , None ], AsyncGenerator [
510520 EmbeddingRequestOutput , None ]]:
511521 """Send an RPCGenerateRequest to the RPCServer and stream responses."""
@@ -550,7 +560,9 @@ async def _process_request(
550560 request_id = request_id ,
551561 lora_request = lora_request ,
552562 trace_headers = trace_headers ,
553- prompt_adapter_request = prompt_adapter_request ))
563+ prompt_adapter_request = prompt_adapter_request ,
564+ priority = priority ,
565+ ))
554566
555567 # 3) Send the RPCGenerateRequest to the MQLLMEngine.
556568 parts = (request_bytes ,
0 commit comments