diff --git a/chat.go b/chat.go
index c8a3e81b3..08688eadc 100644
--- a/chat.go
+++ b/chat.go
@@ -280,6 +280,8 @@ type ChatCompletionRequest struct {
 	// Such as think mode for qwen3. "chat_template_kwargs": {"enable_thinking": false}
 	// https://qwen.readthedocs.io/en/latest/deployment/vllm.html#thinking-non-thinking-modes
 	ChatTemplateKwargs map[string]any `json:"chat_template_kwargs,omitempty"`
+	// Specifies the latency tier to use for processing the request.
+	ServiceTier ServiceTier `json:"service_tier,omitempty"`
 }
 
 type StreamOptions struct {
@@ -363,6 +365,15 @@ const (
 	FinishReasonNull          FinishReason = "null"
 )
 
+type ServiceTier string
+
+const (
+	ServiceTierAuto     ServiceTier = "auto"
+	ServiceTierDefault  ServiceTier = "default"
+	ServiceTierFlex     ServiceTier = "flex"
+	ServiceTierPriority ServiceTier = "priority"
+)
+
 func (r FinishReason) MarshalJSON() ([]byte, error) {
 	if r == FinishReasonNull || r == "" {
 		return []byte("null"), nil
@@ -395,6 +406,7 @@ type ChatCompletionResponse struct {
 	Usage               Usage                  `json:"usage"`
 	SystemFingerprint   string                 `json:"system_fingerprint"`
 	PromptFilterResults []PromptFilterResult   `json:"prompt_filter_results,omitempty"`
+	ServiceTier         ServiceTier            `json:"service_tier,omitempty"`
 
 	httpHeader
 }