Merged
1 change: 1 addition & 0 deletions tests/entrypoints/openai/test_metrics.py
@@ -203,6 +203,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
"vllm:gpu_cache_usage_perc",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"vllm:request_success_total",
"vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket",
"vllm:request_prompt_tokens_count",
21 changes: 19 additions & 2 deletions vllm/v1/engine/__init__.py
@@ -13,6 +13,23 @@
from vllm.sampling_params import SamplingParams


class RequestFinishedReason(enum.IntEnum):
Member:
WDYT about a shorter name FinishReason since it's used in quite a few places?

Member Author:
Sure, sounds good!

"""
Reason a request finished - stop, length, or abort.

stop - a stop string was emitted
length - max_tokens was consumed, or max_model_len was reached
abort - aborted for another reason

"""
STOP = 0
LENGTH = 1
ABORT = 2

def __str__(self):
return self.name.lower()
Member:
This will create a new string every time it's accessed (and convert to lower). We could instead have a global lookup like:

FINISH_REASON_STRINGS = ("stop", "length", "abort")

class RequestFinishedReason(enum.IntEnum):
    # ...

    def __str__(self):
        return FINISH_REASON_STRINGS[self.value]

I think we should also mention here that the specific string names form part of the API and so shouldn't be changed (i.e. not just arbitrary identifiers).

Member Author:
Yep, great catch on the optimization 👍
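
For reference, a minimal sketch of how the enum could look with both suggestions applied (the shorter FinishReason name from the thread above plus the precomputed string tuple). The names and comments here are illustrative, not necessarily the exact merged code:

import enum

# NOTE: these strings form part of the external API (finish_reason in
# responses and metric labels), so they are not arbitrary identifiers
# and should not be renamed.
FINISH_REASON_STRINGS = ("stop", "length", "abort")

class FinishReason(enum.IntEnum):
    STOP = 0
    LENGTH = 1
    ABORT = 2

    def __str__(self):
        return FINISH_REASON_STRINGS[self.value]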



@dataclass
class EngineCoreRequest:

@@ -43,7 +60,7 @@ class EngineCoreOutput(
request_id: str
new_token_ids: List[int]
finished: bool
finish_reason: Optional[str] = None
finish_reason: Optional[RequestFinishedReason] = None
stop_reason: Union[int, str, None] = None


@@ -54,7 +71,7 @@ class EngineCoreOutputs(
gc=False): # type: ignore[call-arg]

#NOTE(Nick): We could consider ways to make this more compact,
# e.g. columnwise layout and using an int enum for finish/stop reason
# e.g. columnwise layout

# [num_reqs]
outputs: List[EngineCoreOutput]
9 changes: 5 additions & 4 deletions vllm/v1/engine/detokenizer.py
@@ -6,7 +6,8 @@
from vllm.sampling_params import RequestOutputKind
from vllm.transformers_utils.detokenizer_utils import (
AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally)
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest,
RequestFinishedReason)

logger = init_logger(__name__)

@@ -16,7 +17,7 @@ class DetokenizerOutput:
output_text: str
token_ids: List[int]
finished: bool
finish_reason: Optional[str] = None
finish_reason: Optional[RequestFinishedReason] = None
stop_reason: Union[int, str, None] = None


@@ -145,13 +146,13 @@ def update_from_output(
stop_str, truncate_to = stop
if truncate_to != -1:
self.output_text = self.output_text[:truncate_to]
finish_reason = "stop" # TODO: use constant
finish_reason = RequestFinishedReason.STOP
stop_reason = stop_str

# TODO: handle stop_token_ids here too?

# 3) Update the RequestOutput object with the new text.
finished = bool(finish_reason)
finished = finish_reason is not None
if self.output_kind == RequestOutputKind.FINAL_ONLY \
and not finished:
return None
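
One subtlety in the finished check above: RequestFinishedReason.STOP has the integer value 0, so truthiness testing (the old bool(finish_reason)) would misreport a stopped request as unfinished, which is why the explicit is not None comparison is needed. A standalone illustration, not vLLM code:

import enum

class RequestFinishedReason(enum.IntEnum):
    STOP = 0
    LENGTH = 1
    ABORT = 2

finish_reason = RequestFinishedReason.STOP
assert not bool(finish_reason)      # an IntEnum member with value 0 is falsy
assert finish_reason is not None    # the explicit None check behaves correctly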
22 changes: 12 additions & 10 deletions vllm/v1/engine/output_processor.py
@@ -159,8 +159,10 @@ def process_outputs(
engine_core_output)

# 3) Create and handle RequestOutput objects.
if request_output := self._make_request_output(
req_state, detokenizer_output):
if detokenizer_output is not None:
request_output = self._make_request_output(
req_state, detokenizer_output)

if req_state.queue is not None:
# AsyncLLM: put into queue for handling by generate().
req_state.queue.put_nowait(request_output)
@@ -170,6 +172,8 @@

# Free completed requests.
if request_output.finished:
assert detokenizer_output.finish_reason is not None

self.request_states.pop(req_id)
if not engine_core_output.finished:
# If req not finished in EngineCore, but Detokenizer
@@ -178,7 +182,8 @@

# Track per-request stats
iteration_stats.update_from_finished_request(
request_output, req_state.stats)
detokenizer_output.finish_reason, request_output,
req_state.stats)

return OutputProcessorOutput(
request_outputs=request_outputs,
@@ -189,12 +194,8 @@
@staticmethod
def _make_request_output(
request_state: RequestState,
detokenizer_output: Optional[DetokenizerOutput],
) -> Optional[RequestOutput]:

if detokenizer_output is None:
return None

detokenizer_output: DetokenizerOutput,
) -> RequestOutput:
request_output = RequestOutput.new(
request_state.request_id,
request_state.prompt,
@@ -205,7 +206,8 @@ def _make_request_output(
)
if detokenizer_output.finished:
completion_output = request_output.outputs[0]
completion_output.finish_reason = detokenizer_output.finish_reason
completion_output.finish_reason = str(
detokenizer_output.finish_reason)
completion_output.stop_reason = detokenizer_output.stop_reason

return request_output
15 changes: 14 additions & 1 deletion vllm/v1/metrics/loggers.py
@@ -1,12 +1,13 @@
import time
from abc import ABC, abstractmethod
from typing import List
from typing import Dict, List

import numpy as np
import prometheus_client

from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.v1.engine import RequestFinishedReason
from vllm.v1.metrics.stats import IterationStats, SchedulerStats

logger = init_logger(__name__)
@@ -114,6 +115,17 @@ def __init__(self, model_config: ModelConfig):
documentation="Number of generation tokens processed.",
labelnames=labelnames).labels(*labelvalues)

self.counter_request_success: Dict[RequestFinishedReason,
prometheus_client.Counter] = {}
counter_request_success_base = prometheus_client.Counter(
name="vllm:request_success_total",
documentation="Count of successfully processed requests.",
labelnames=labelnames + ["finished_reason"])
for reason in RequestFinishedReason:
self.counter_request_success[
reason] = counter_request_success_base.labels(*(labelvalues +
[str(reason)]))

self.histogram_num_prompt_tokens_request = \
prometheus_client.Histogram(
name="vllm:request_prompt_tokens",
@@ -161,6 +173,7 @@ def log(self, scheduler_stats: SchedulerStats,
iteration_stats.num_generation_tokens)

for finished_request in iteration_stats.finished_requests:
self.counter_request_success[finished_request.finish_reason].inc()
self.histogram_num_prompt_tokens_request.observe(
finished_request.num_prompt_tokens)
self.histogram_num_generation_tokens_request.observe(
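
For context on what this adds, a scrape of the server's /metrics endpoint would then be expected to expose one vllm:request_success_total series per finish reason, roughly as below. This is illustrative output only; the model_name label is an assumption about the existing labelnames, and the values are invented.

vllm:request_success_total{model_name="example-model",finished_reason="stop"} 42.0
vllm:request_success_total{model_name="example-model",finished_reason="length"} 7.0
vllm:request_success_total{model_name="example-model",finished_reason="abort"} 1.0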
10 changes: 7 additions & 3 deletions vllm/v1/metrics/stats.py
@@ -4,7 +4,7 @@

if TYPE_CHECKING:
from vllm.outputs import RequestOutput
from vllm.v1.engine import EngineCoreOutput
from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason


@dataclass
@@ -30,6 +30,7 @@ class RequestStateStats:
class FinishedRequestStats:
"""Stats associated with a finished request."""

finish_reason: "RequestFinishedReason"
num_prompt_tokens: int = 0
num_generation_tokens: int = 0

@@ -71,8 +72,11 @@ def update_from_output(self, output: "EngineCoreOutput",
request_state_stats.num_generation_tokens += num_new_generation_tokens
request_state_stats.last_token_time = now

def update_from_finished_request(self, request_output: "RequestOutput",
def update_from_finished_request(self,
finish_reason: "RequestFinishedReason",
request_output: "RequestOutput",
request_state_stats: RequestStateStats):
self.finished_requests.append(
FinishedRequestStats(len(request_output.prompt_token_ids),
FinishedRequestStats(finish_reason,
len(request_output.prompt_token_ids),
request_state_stats.num_generation_tokens))
15 changes: 8 additions & 7 deletions vllm/v1/request.py
@@ -4,7 +4,7 @@
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
from vllm.sequence import RequestMetrics
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason
from vllm.v1.utils import ConstantList

if TYPE_CHECKING:
@@ -106,7 +106,7 @@ def num_output_tokens(self) -> int:
def is_finished(self) -> bool:
return RequestStatus.is_finished(self.status)

def get_finished_reason(self) -> Union[str, None]:
def get_finished_reason(self) -> Union[RequestFinishedReason, None]:
return RequestStatus.get_finished_reason(self.status)

def has_encoder_inputs(self) -> bool:
@@ -150,7 +150,8 @@ def is_finished(status: "RequestStatus") -> bool:
return status > RequestStatus.PREEMPTED

@staticmethod
def get_finished_reason(status: "RequestStatus") -> Union[str, None]:
def get_finished_reason(
status: "RequestStatus") -> Union[RequestFinishedReason, None]:
return _FINISHED_REASON_MAP.get(status)


@@ -159,8 +160,8 @@ def get_finished_reason(status: "RequestStatus") -> Union[str, None]:
# are longer than the model's length cap. Therefore, the stop
# reason should also be "length" as in OpenAI API.
_FINISHED_REASON_MAP = {
RequestStatus.FINISHED_STOPPED: "stop",
RequestStatus.FINISHED_LENGTH_CAPPED: "length",
RequestStatus.FINISHED_ABORTED: "abort",
RequestStatus.FINISHED_IGNORED: "length",
RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP,
RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH,
RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT,
RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH,
}
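
As a small usage sketch of the new mapping (assuming the __str__ defined on the enum in vllm/v1/engine/__init__.py above), the status-to-reason lookup now round-trips to the OpenAI-style strings:

from vllm.v1.engine import RequestFinishedReason
from vllm.v1.request import RequestStatus

# Ignored prompts report "length", matching the OpenAI API behaviour noted above.
assert RequestStatus.get_finished_reason(RequestStatus.FINISHED_STOPPED) is RequestFinishedReason.STOP
assert str(RequestStatus.get_finished_reason(RequestStatus.FINISHED_IGNORED)) == "length"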