From 14f13ed6905c1bdacc9da12cc0c56bedfeb9e71e Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sat, 19 Jul 2025 16:27:38 +0000 Subject: [PATCH 01/51] added debug logging Signed-off-by: Robert Shaw --- .../distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 0c5986bfafaa..15dedcb48404 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -874,6 +874,8 @@ def _pop_done_transfers( Returns: set of req_ids that have all done xfers """ + current_time = time.perf_counter() + done_req_ids: set[str] = set() for req_id, handles in list(transfers.items()): in_progress = False @@ -881,6 +883,8 @@ def _pop_done_transfers( xfer_state = self.nixl_wrapper.check_xfer_state(handle) if xfer_state == "DONE": self.nixl_wrapper.release_xfer_handle(handle) + logger.debug("[nixl] req_id: %s transfer time: %s", + req_id, current_time - _xfer_stime) elif xfer_state == "PROC": in_progress = True continue From b90d33163c28b60768d0f1ad03c534780a14f68d Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 02:15:19 +0000 Subject: [PATCH 02/51] updated Signed-off-by: Robert Shaw --- vllm/engine/arg_utils.py | 16 +++++++++------- vllm/entrypoints/cli/serve.py | 10 +++++----- .../model_loader/weight_utils.py | 2 +- vllm/v1/engine/core.py | 9 +++++++++ vllm/v1/engine/core_client.py | 3 +++ vllm/v1/engine/utils.py | 19 ++++++++++++++++--- 6 files changed, 43 insertions(+), 16 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 1ca4917de26b..c3262d6e277e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1091,13 +1091,15 @@ def create_engine_config( # but we should not do this here. placement_group = ray.util.get_current_placement_group() - data_parallel_external_lb = self.data_parallel_rank is not None - if data_parallel_external_lb: - assert self.data_parallel_size_local in (1, None), ( - "data_parallel_size_local must be 1 when data_parallel_rank " - "is set") - data_parallel_size_local = 1 - elif self.data_parallel_size_local is not None: + # data_parallel_external_lb = self.data_parallel_rank is not None + # if data_parallel_external_lb: + # assert self.data_parallel_size_local in (1, None), ( + # "data_parallel_size_local must be 1 when data_parallel_rank " + # "is set") + # data_parallel_size_local = 1 + # elif self.data_parallel_size_local is not None: + data_parallel_external_lb = False + if self.data_parallel_size_local is not None: data_parallel_size_local = self.data_parallel_size_local else: # Local DP size defaults to global DP size if not set. diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 1204ccc1c679..6c01e24b166f 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -45,11 +45,11 @@ def cmd(args: argparse.Namespace) -> None: if args.headless or args.api_server_count < 1: run_headless(args) else: - if args.data_parallel_start_rank: - raise ValueError( - "data_parallel_start_rank is only applicable " - "in headless mode. " - "Add --headless flag to enable headless mode.") + # if args.data_parallel_start_rank: + # raise ValueError( + # "data_parallel_start_rank is only applicable " + # "in headless mode. 
" + # "Add --headless flag to enable headless mode.") if args.api_server_count > 1: run_multi_api_server(args) else: diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 64a2089921ee..dc1bc72ab410 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -303,7 +303,7 @@ def download_weights_from_hf( allow_patterns=allow_patterns, ignore_patterns=ignore_patterns, cache_dir=cache_dir, - tqdm_class=DisabledTqdm, + # tqdm_class=DisabledTqdm, revision=revision, local_files_only=local_only, ) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ca636bf5a6f7..73201b3f35b3 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -411,10 +411,12 @@ def __init__( identity = self.engine_index.to_bytes(length=2, byteorder="little") self.engines_running = False + logger.info("======= HANDSHAKING:") with self._perform_handshakes(handshake_address, identity, local_client, vllm_config, client_handshake_address) as addresses: self.client_count = len(addresses.outputs) + logger.info(f"{addresses.outputs=}") # Set up data parallel environment. self.has_coordinator = addresses.coordinator_output is not None @@ -482,16 +484,21 @@ def _perform_handshakes( """ input_ctx = zmq.Context() is_local = local_client and client_handshake_address is None + logger.info(f"HS: {handshake_address=}, {is_local=}") handshake = self._perform_handshake(input_ctx, handshake_address, identity, is_local, vllm_config, vllm_config.parallel_config) + logger.info(f"DONE HS: {handshake=}") if client_handshake_address is None: with handshake as addresses: yield addresses else: + logger.info(f"HS: {client_handshake_address=}, {local_client=}") local_handshake = self._perform_handshake( input_ctx, client_handshake_address, identity, local_client, vllm_config) + logger.info(f"DONE HS: {local_handshake=}") + with handshake as addresses, local_handshake as client_addresses: addresses.inputs = client_addresses.inputs addresses.outputs = client_addresses.outputs @@ -517,6 +524,8 @@ def _perform_handshake( linger=5000, bind=False) as handshake_socket: # Register engine with front-end. + logger.info(f"calling startup_handshake: {handshake_address=}") + logger.info(f"calling startup_handshake: {local_client=}") addresses = self.startup_handshake(handshake_socket, local_client, parallel_config_to_update) yield addresses diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 82fc1fa9937c..ac4d33aac659 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -405,12 +405,15 @@ def __init__( "stats_update_address") else: # Engines are managed by this client. + print(f"{vllm_config.parallel_config=}") with launch_core_engines(vllm_config, executor_class, log_stats) as (engine_manager, coordinator, addresses): self.resources.coordinator = coordinator self.resources.engine_manager = engine_manager + print("========================================") + print(f"{vllm_config.parallel_config=}") (input_address, ) = addresses.inputs (output_address, ) = addresses.outputs diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index 6dde477576b8..d5988a40945a 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -555,6 +555,8 @@ def launch_core_engines( # sends requests only to colocated engines. client_local_only = offline_mode or external_dp_lb or (local_engine_count == dp_size) + # HACK: handle case with one pod per node. 
+ client_local_only = True # Set up input and output addresses. addresses = EngineZmqAddresses( @@ -601,11 +603,17 @@ def launch_core_engines( if offline_mode or (external_dp_lb and dp_rank > 0): assert local_engine_count == 1 engines_to_handshake = [CoreEngine(index=dp_rank, local=True)] - else: + elif dp_rank == 0: engines_to_handshake = [ CoreEngine(index=i, local=(i < local_engine_count)) for i in range(dp_size) ] + else: + # Just handshake with local engines. + engines_to_handshake = [ + CoreEngine(index=i, local=True) for i in + range(dp_rank, dp_rank + local_engine_count) + ] # Whether the started engines will handshake only with co-located # front-end processes. In external_dp_lb mode, ranks > 0 handshake with @@ -616,7 +624,8 @@ def launch_core_engines( handshake_address = get_engine_client_zmq_addr( handshake_local_only, host, parallel_config.data_parallel_rpc_port) - if external_dp_lb and dp_rank > 0: + # if external_dp_lb and dp_rank > 0: + if dp_rank > 0: assert not handshake_local_only local_handshake_address = get_open_zmq_ipc_path() client_handshake_address = local_handshake_address @@ -624,15 +633,18 @@ def launch_core_engines( local_handshake_address = handshake_address client_handshake_address = None + print(f"{local_handshake_address=}") with zmq_socket_ctx(local_handshake_address, zmq.ROUTER, bind=True) as handshake_socket: from vllm.v1.engine.core import EngineCoreProc + print(f"{client_handshake_address=}") + print(f"{handshake_address=}") # Start local engines. if local_engine_count: # In server mode, start_index and local_start_index will - # both be 0. + # both be 0. << todo: update local_engine_manager = CoreEngineProcManager( EngineCoreProc.run_engine_core, vllm_config=vllm_config, @@ -650,6 +662,7 @@ def launch_core_engines( yield local_engine_manager, coordinator, addresses # Now wait for engines to start. 
+ print(f"{engines_to_handshake=}") wait_for_engine_startup( handshake_socket, addresses, From aefeeed64d3b7a33563ce63cac5ee2203ec56179 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 02:18:01 +0000 Subject: [PATCH 03/51] updated Signed-off-by: Robert Shaw --- .../distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 15dedcb48404..0c5986bfafaa 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -874,8 +874,6 @@ def _pop_done_transfers( Returns: set of req_ids that have all done xfers """ - current_time = time.perf_counter() - done_req_ids: set[str] = set() for req_id, handles in list(transfers.items()): in_progress = False @@ -883,8 +881,6 @@ def _pop_done_transfers( xfer_state = self.nixl_wrapper.check_xfer_state(handle) if xfer_state == "DONE": self.nixl_wrapper.release_xfer_handle(handle) - logger.debug("[nixl] req_id: %s transfer time: %s", - req_id, current_time - _xfer_stime) elif xfer_state == "PROC": in_progress = True continue From 59a958362f8b26568a10b6c31b706e74ba488542 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 02:19:30 +0000 Subject: [PATCH 04/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/core.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 73201b3f35b3..fe51924c7784 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -484,20 +484,16 @@ def _perform_handshakes( """ input_ctx = zmq.Context() is_local = local_client and client_handshake_address is None - logger.info(f"HS: {handshake_address=}, {is_local=}") handshake = self._perform_handshake(input_ctx, handshake_address, identity, is_local, vllm_config, vllm_config.parallel_config) - logger.info(f"DONE HS: {handshake=}") if client_handshake_address is None: with handshake as addresses: yield addresses else: - logger.info(f"HS: {client_handshake_address=}, {local_client=}") local_handshake = self._perform_handshake( input_ctx, client_handshake_address, identity, local_client, vllm_config) - logger.info(f"DONE HS: {local_handshake=}") with handshake as addresses, local_handshake as client_addresses: addresses.inputs = client_addresses.inputs @@ -524,8 +520,6 @@ def _perform_handshake( linger=5000, bind=False) as handshake_socket: # Register engine with front-end. 
- logger.info(f"calling startup_handshake: {handshake_address=}") - logger.info(f"calling startup_handshake: {local_client=}") addresses = self.startup_handshake(handshake_socket, local_client, parallel_config_to_update) yield addresses From 48cf09be0bd5de084de6d537322ef6c505831e72 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 02:19:52 +0000 Subject: [PATCH 05/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index fe51924c7784..80ba91e2e7f1 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -411,12 +411,10 @@ def __init__( identity = self.engine_index.to_bytes(length=2, byteorder="little") self.engines_running = False - logger.info("======= HANDSHAKING:") with self._perform_handshakes(handshake_address, identity, local_client, vllm_config, client_handshake_address) as addresses: self.client_count = len(addresses.outputs) - logger.info(f"{addresses.outputs=}") # Set up data parallel environment. self.has_coordinator = addresses.coordinator_output is not None From 2fd05875d4075f056e2ba16ce17f90fcc1139d57 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 02:20:36 +0000 Subject: [PATCH 06/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/core.py | 1 - vllm/v1/engine/core_client.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 80ba91e2e7f1..ca636bf5a6f7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -492,7 +492,6 @@ def _perform_handshakes( local_handshake = self._perform_handshake( input_ctx, client_handshake_address, identity, local_client, vllm_config) - with handshake as addresses, local_handshake as client_addresses: addresses.inputs = client_addresses.inputs addresses.outputs = client_addresses.outputs diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index ac4d33aac659..82fc1fa9937c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -405,15 +405,12 @@ def __init__( "stats_update_address") else: # Engines are managed by this client. 
- print(f"{vllm_config.parallel_config=}") with launch_core_engines(vllm_config, executor_class, log_stats) as (engine_manager, coordinator, addresses): self.resources.coordinator = coordinator self.resources.engine_manager = engine_manager - print("========================================") - print(f"{vllm_config.parallel_config=}") (input_address, ) = addresses.inputs (output_address, ) = addresses.outputs From 14cf3c4786b28ea72537604b4f7abc1e00a7905c Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 02:20:54 +0000 Subject: [PATCH 07/51] updated Signed-off-by: Robert Shaw --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index dc1bc72ab410..64a2089921ee 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -303,7 +303,7 @@ def download_weights_from_hf( allow_patterns=allow_patterns, ignore_patterns=ignore_patterns, cache_dir=cache_dir, - # tqdm_class=DisabledTqdm, + tqdm_class=DisabledTqdm, revision=revision, local_files_only=local_only, ) From 4f5d3eabc81a825ef46a8305bef104989e569e38 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 02:21:19 +0000 Subject: [PATCH 08/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index d5988a40945a..ddf0dd4edeab 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -638,8 +638,6 @@ def launch_core_engines( bind=True) as handshake_socket: from vllm.v1.engine.core import EngineCoreProc - print(f"{client_handshake_address=}") - print(f"{handshake_address=}") # Start local engines. if local_engine_count: From 14db6606f2b40afce0d7101dceb88b2495c9ebdc Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 02:21:51 +0000 Subject: [PATCH 09/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index ddf0dd4edeab..d45a2d80d2d6 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -633,7 +633,6 @@ def launch_core_engines( local_handshake_address = handshake_address client_handshake_address = None - print(f"{local_handshake_address=}") with zmq_socket_ctx(local_handshake_address, zmq.ROUTER, bind=True) as handshake_socket: From 2aa497571d8c0c6c9c261f50adfb1084b6b51636 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 02:23:34 +0000 Subject: [PATCH 10/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index d45a2d80d2d6..2ac1f750381c 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -659,7 +659,6 @@ def launch_core_engines( yield local_engine_manager, coordinator, addresses # Now wait for engines to start. 
- print(f"{engines_to_handshake=}") wait_for_engine_startup( handshake_socket, addresses, From b142571366c685bfbef502ff8f77c0e566099281 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 02:24:49 +0000 Subject: [PATCH 11/51] cleanup Signed-off-by: Robert Shaw --- vllm/entrypoints/cli/serve.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 6c01e24b166f..1204ccc1c679 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -45,11 +45,11 @@ def cmd(args: argparse.Namespace) -> None: if args.headless or args.api_server_count < 1: run_headless(args) else: - # if args.data_parallel_start_rank: - # raise ValueError( - # "data_parallel_start_rank is only applicable " - # "in headless mode. " - # "Add --headless flag to enable headless mode.") + if args.data_parallel_start_rank: + raise ValueError( + "data_parallel_start_rank is only applicable " + "in headless mode. " + "Add --headless flag to enable headless mode.") if args.api_server_count > 1: run_multi_api_server(args) else: From e1843b7e6c23eceea0b0ee64c619eae4a5d95900 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 02:30:23 +0000 Subject: [PATCH 12/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index 2ac1f750381c..ac3acafbe01d 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -640,8 +640,8 @@ def launch_core_engines( # Start local engines. if local_engine_count: - # In server mode, start_index and local_start_index will - # both be 0. << todo: update + # In server mode, start_index is the dp_rank and + # local_start_index is 0. 
local_engine_manager = CoreEngineProcManager( EngineCoreProc.run_engine_core, vllm_config=vllm_config, From d2d54e9c7284fd2ac08e5960fbf9cfe9a227e195 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 02:46:54 +0000 Subject: [PATCH 13/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/core_client.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 82fc1fa9937c..2d574efd03c5 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -429,11 +429,12 @@ def __init__( parallel_config = vllm_config.parallel_config dp_size = parallel_config.data_parallel_size dp_rank = parallel_config.data_parallel_rank + dp_local_size = parallel_config.data_parallel_size_local external_dp_lb = parallel_config.data_parallel_external_lb offline_mode = parallel_config.data_parallel_rank_local is not None - engine_ranks = [dp_rank] if (offline_mode - or external_dp_lb) else range(dp_size) + engine_ranks = ([dp_rank] if (offline_mode or external_dp_lb) + else range(dp_rank, dp_rank + dp_local_size)) assert parallel_config.data_parallel_size_local <= len( engine_ranks) From 4438796b4831d9bfea6fa843a85373f9fc33da39 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 03:44:38 +0000 Subject: [PATCH 14/51] fix lb issues Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 1 + vllm/v1/engine/core_client.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6395d2c1875b..6736ed727345 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -325,6 +325,7 @@ async def generate( # Note: drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() or await q.get() + # Note: both OutputProcessor and EngineCore handle their # own request cleanup based on finished. 
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 2d574efd03c5..127c4fb8473c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -709,8 +709,12 @@ def _ensure_output_queue_task(self): assert output_socket is not None async def process_outputs_socket(): + i = 0 try: while True: + if i % 100 == 0: + logger.info(f"{i=}") + i += 1 frames = await output_socket.recv_multipart(copy=False) resources.validate_alive(frames) outputs: EngineCoreOutputs = decoder.decode(frames) @@ -895,6 +899,8 @@ def _ensure_stats_update_task(self): return assert self.stats_update_address is not None + dp_start_rank = self.vllm_config.parallel_config.data_parallel_rank + dp_end_rank = dp_start_rank + self.vllm_config.parallel_config.data_parallel_size_local async def run_engine_stats_update_task(): with make_zmq_socket(self.ctx, self.stats_update_address, @@ -959,7 +965,7 @@ async def run_engine_stats_update_task(): counts, wave, running = msgspec.msgpack.decode(buf) self.current_wave = wave self.engines_running = running - self.lb_engines = counts + self.lb_engines = counts[dp_start_rank:dp_end_rank] resources.stats_update_task = asyncio.create_task( run_engine_stats_update_task()) @@ -973,6 +979,7 @@ async def add_request_async(self, request: EngineCoreRequest) -> None: chosen_engine = self.get_core_engine_for_request(request) to_await = self._send_input(EngineCoreRequestType.ADD, request, chosen_engine) + if not self.engines_running: # Notify coordinator that we're sending a request req_msg = msgspec.msgpack.encode(("FIRST_REQ", chosen_engine)) @@ -1007,6 +1014,9 @@ def __init__(self, def get_core_engine_for_request( self, request: EngineCoreRequest) -> EngineIdentity: + logger.info(f"{self.lb_engines=} | {request.data_parallel_rank=}") + logger.info(f"{self.core_engine=}") + logger.info(f"{self.client_index=}") # Engines are in rank order. if (eng_index := request.data_parallel_rank) is None: if not self.lb_engines: From 2a68433a82140542bc186c649c2a359b853bd533 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 03:45:48 +0000 Subject: [PATCH 15/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/core_client.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 127c4fb8473c..3532ab1bfb24 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -709,12 +709,8 @@ def _ensure_output_queue_task(self): assert output_socket is not None async def process_outputs_socket(): - i = 0 try: while True: - if i % 100 == 0: - logger.info(f"{i=}") - i += 1 frames = await output_socket.recv_multipart(copy=False) resources.validate_alive(frames) outputs: EngineCoreOutputs = decoder.decode(frames) From 1ced153eecbe4c7669cc6e242e7ab1ea48940626 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 03:47:23 +0000 Subject: [PATCH 16/51] updatedd Signed-off-by: Robert Shaw --- vllm/v1/engine/core_client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 3532ab1bfb24..5a256895ca7e 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -961,6 +961,8 @@ async def run_engine_stats_update_task(): counts, wave, running = msgspec.msgpack.decode(buf) self.current_wave = wave self.engines_running = running + # NOTE: counts includes num running for all global + # EngineCores, so need to slide for the local ones. 
self.lb_engines = counts[dp_start_rank:dp_end_rank] resources.stats_update_task = asyncio.create_task( From b9c0f658ca5db7f1562f24cbb2d06b5a79fbb194 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 03:47:45 +0000 Subject: [PATCH 17/51] nits Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6736ed727345..6395d2c1875b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -325,7 +325,6 @@ async def generate( # Note: drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() or await q.get() - # Note: both OutputProcessor and EngineCore handle their # own request cleanup based on finished. From dbc51d6e98409ed38c23773efa8407df687ab458 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 03:48:11 +0000 Subject: [PATCH 18/51] nits Signed-off-by: Robert Shaw --- vllm/v1/engine/core_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 5a256895ca7e..1662577b837d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -977,7 +977,6 @@ async def add_request_async(self, request: EngineCoreRequest) -> None: chosen_engine = self.get_core_engine_for_request(request) to_await = self._send_input(EngineCoreRequestType.ADD, request, chosen_engine) - if not self.engines_running: # Notify coordinator that we're sending a request req_msg = msgspec.msgpack.encode(("FIRST_REQ", chosen_engine)) From 471fa4ae688b065039758e8816b53bb0ed414825 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 03:54:43 +0000 Subject: [PATCH 19/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/core_client.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 1662577b837d..13d0ee19d16b 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1011,9 +1011,6 @@ def __init__(self, def get_core_engine_for_request( self, request: EngineCoreRequest) -> EngineIdentity: - logger.info(f"{self.lb_engines=} | {request.data_parallel_rank=}") - logger.info(f"{self.core_engine=}") - logger.info(f"{self.client_index=}") # Engines are in rank order. if (eng_index := request.data_parallel_rank) is None: if not self.lb_engines: From 6569facd3be05817f713fcf8c7dc13e9f9eba2f3 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 13:34:38 +0000 Subject: [PATCH 20/51] stash Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 2 ++ vllm/v1/metrics/loggers.py | 1 + 2 files changed, 3 insertions(+) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6395d2c1875b..a76ad96bf88e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -410,6 +410,8 @@ async def output_handler(): # 4) Logging. # TODO(rob): make into a coroutine and launch it in # background thread once Prometheus overhead is non-trivial. 
+ logger.info(f"{outputs.engine_index=}") + logger.info(f"{stat_loggers[outputs.engine_index]=}") if stat_loggers: AsyncLLM._record_stats( stat_loggers[outputs.engine_index], diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index c720ca13e51b..8d879b5e98ed 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -422,6 +422,7 @@ def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): def record(self, scheduler_stats: Optional[SchedulerStats], iteration_stats: Optional[IterationStats]): + logger.info(f"PROM STAT LOGGER: {self.engine_index=}") """Log to prometheus.""" if scheduler_stats is not None: self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) From 1e5303a8017c063b7d79f2400e3f0043e05ad476 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 13:37:34 +0000 Subject: [PATCH 21/51] stash Signed-off-by: Robert Shaw --- vllm/v1/metrics/loggers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 8d879b5e98ed..24799dedba29 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -148,7 +148,7 @@ class PrometheusStatLogger(StatLoggerBase): def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): - unregister_vllm_metrics() + # unregister_vllm_metrics() self.vllm_config = vllm_config self.engine_index = engine_index # Use this flag to hide metrics that were deprecated in From a69edca3698f6a4bce112c50063e7a712cbd3a02 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 13:52:50 +0000 Subject: [PATCH 22/51] convert to use only one prometheus stat logger per async llm Signed-off-by: Robert Shaw --- vllm/v1/metrics/loggers.py | 288 +++++++++++++++++++++++-------------- 1 file changed, 181 insertions(+), 107 deletions(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 24799dedba29..70442855f7b3 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -146,107 +146,142 @@ class PrometheusStatLogger(StatLoggerBase): _histogram_cls = prometheus_client.Histogram _spec_decoding_cls = SpecDecodingProm - def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): + def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): # unregister_vllm_metrics() self.vllm_config = vllm_config - self.engine_index = engine_index + self.engine_indexes = [str(idx) for idx in engine_indexes] # Use this flag to hide metrics that were deprecated in # a previous release and which will be removed future self.show_hidden_metrics = \ vllm_config.observability_config.show_hidden_metrics labelnames = ["model_name", "engine"] - labelvalues = [ - vllm_config.model_config.served_model_name, - str(engine_index) - ] - + model_name = vllm_config.model_config.served_model_name, max_model_len = vllm_config.model_config.max_model_len - self.spec_decoding_prom = self._spec_decoding_cls( - vllm_config.speculative_config, labelnames, labelvalues) + # self.spec_decoding_prom = self._spec_decoding_cls( + # vllm_config.speculative_config, labelnames, labelvalues) # # Scheduler state # - self.gauge_scheduler_running = self._gauge_cls( - name="vllm:num_requests_running", - documentation="Number of requests in model execution batches.", - multiprocess_mode="mostrecent", - labelnames=labelnames).labels(*labelvalues) - - self.gauge_scheduler_waiting = self._gauge_cls( - name="vllm:num_requests_waiting", - documentation="Number of requests waiting to be processed.", - 
multiprocess_mode="mostrecent", - labelnames=labelnames).labels(*labelvalues) + self.gauge_scheduler_running = { + idx: self._gauge_cls( + name="vllm:num_requests_running", + documentation="Number of requests in model execution batches.", + multiprocess_mode="mostrecent", + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + + self.gauge_scheduler_waiting = { + self._gauge_cls( + name="vllm:num_requests_waiting", + documentation="Number of requests waiting to be processed.", + multiprocess_mode="mostrecent", + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } # # GPU cache # # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc # TODO: in 0.10, only enable if show_hidden_metrics=True - self.gauge_gpu_cache_usage = self._gauge_cls( - name="vllm:gpu_cache_usage_perc", - documentation=( - "GPU KV-cache usage. 1 means 100 percent usage." - "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), - multiprocess_mode="mostrecent", - labelnames=labelnames).labels(*labelvalues) + self.gauge_gpu_cache_usage = { + idx: self._gauge_cls( + name="vllm:gpu_cache_usage_perc", + documentation=( + "GPU KV-cache usage. 1 means 100 percent usage." + "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), + multiprocess_mode="mostrecent", + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries # TODO: in 0.10, only enable if show_hidden_metrics=True - self.counter_gpu_prefix_cache_queries = self._counter_cls( - name="vllm:gpu_prefix_cache_queries", - documentation= - ("GPU prefix cache queries, in terms of number of queried tokens." - "DEPRECATED: Use vllm:prefix_cache_queries instead."), - labelnames=labelnames).labels(*labelvalues) + self.counter_gpu_prefix_cache_queries = { + idx: self._counter_cls( + name="vllm:gpu_prefix_cache_queries", + documentation=( + "GPU prefix cache queries, in terms of number of queried tokens." + "DEPRECATED: Use vllm:prefix_cache_queries instead."), + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits # TODO: in 0.10, only enable if show_hidden_metrics=True - self.counter_gpu_prefix_cache_hits = self._counter_cls( - name="vllm:gpu_prefix_cache_hits", - documentation=( - "GPU prefix cache hits, in terms of number of cached tokens." - "DEPRECATED: Use vllm:prefix_cache_hits instead."), - labelnames=labelnames).labels(*labelvalues) - - self.gauge_kv_cache_usage = self._gauge_cls( + self.counter_gpu_prefix_cache_hits = { + idx: self._counter_cls( + name="vllm:gpu_prefix_cache_hits", + documentation=( + "GPU prefix cache hits, in terms of number of cached tokens." + "DEPRECATED: Use vllm:prefix_cache_hits instead."), + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + + + self.gauge_kv_cache_usage = { + idx: self._gauge_cls( name="vllm:kv_cache_usage_perc", documentation="KV-cache usage. 
1 means 100 percent usage.", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } - self.counter_prefix_cache_queries = self._counter_cls( + self.counter_prefix_cache_queries = { + idx: elf._counter_cls( name="vllm:prefix_cache_queries", documentation=( "Prefix cache queries, in terms of number of queried tokens."), - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + - self.counter_prefix_cache_hits = self._counter_cls( + self.counter_prefix_cache_hits = { + idx: self._counter_cls( name="vllm:prefix_cache_hits", documentation=( "Prefix cache hits, in terms of number of cached tokens."), - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + # # Counters # - self.counter_num_preempted_reqs = self._counter_cls( + self.counter_num_preempted_reqs = { + idx: self._counter_cls( name="vllm:num_preemptions", documentation="Cumulative number of preemption from the engine.", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + - self.counter_prompt_tokens = self._counter_cls( + self.counter_prompt_tokens = { + idx: self._counter_cls( name="vllm:prompt_tokens", documentation="Number of prefill tokens processed.", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } - self.counter_generation_tokens = self._counter_cls( + + self.counter_generation_tokens = { + idx: self._counter_cls( name="vllm:generation_tokens", documentation="Number of generation tokens processed.", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } self.counter_request_success: dict[FinishReason, prometheus_client.Counter] = {} @@ -256,30 +291,36 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): labelnames=labelnames + ["finished_reason"]) for reason in FinishReason: self.counter_request_success[ - reason] = counter_request_success_base.labels(*(labelvalues + - [str(reason)])) + reason] = {idx: counter_request_success_base.labels(model_name, idx, str(reason)) for idx in self.engine_indexes} # # Histograms of counts # - self.histogram_num_prompt_tokens_request = \ - self._histogram_cls( + self.histogram_num_prompt_tokens_request = { + idx: self._histogram_cls( name="vllm:request_prompt_tokens", documentation="Number of prefill tokens processed.", buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } - self.histogram_num_generation_tokens_request = \ - self._histogram_cls( + + self.histogram_num_generation_tokens_request = { + idx: self._histogram_cls( name="vllm:request_generation_tokens", documentation="Number of generation tokens processed.", buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + # TODO: This metric might be incorrect in case of using multiple # api_server counts which uses prometheus mp. 
# See: https://github.com/vllm-project/vllm/pull/18053 - self.histogram_iteration_tokens = \ + self.histogram_iteration_tokens = {idx: + self._histogram_cls( name="vllm:iteration_tokens_total", documentation="Histogram of number of tokens per engine_step.", @@ -287,34 +328,46 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384 ], - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } - self.histogram_max_num_generation_tokens_request = \ - self._histogram_cls( + + self.histogram_max_num_generation_tokens_request = { + idx: self._histogram_cls( name="vllm:request_max_num_generation_tokens", documentation= "Histogram of maximum number of requested generation tokens.", buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } - self.histogram_n_request = \ + + self.histogram_n_request = {idx: self._histogram_cls( name="vllm:request_params_n", documentation="Histogram of the n request parameter.", buckets=[1, 2, 5, 10, 20], - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + - self.histogram_max_tokens_request = \ + self.histogram_max_tokens_request = {idx: self._histogram_cls( name="vllm:request_params_max_tokens", documentation="Histogram of the max_tokens request parameter.", buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + # # Histogram of timing intervals # - self.histogram_time_to_first_token = \ + self.histogram_time_to_first_token = {idx: self._histogram_cls( name="vllm:time_to_first_token_seconds", documentation="Histogram of time to first token in seconds.", @@ -323,9 +376,12 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0 ], - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + - self.histogram_time_per_output_token = \ + self.histogram_time_per_output_token = {idx: self._histogram_cls( name="vllm:time_per_output_token_seconds", documentation="Histogram of time per output token in seconds.", @@ -333,70 +389,88 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 ], - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + request_latency_buckets = [ 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 ] - self.histogram_e2e_time_request = \ + self.histogram_e2e_time_request = {idx: self._histogram_cls( name="vllm:e2e_request_latency_seconds", documentation="Histogram of e2e request latency in seconds.", buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) - self.histogram_queue_time_request = \ + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + + self.histogram_queue_time_request = {idx: self._histogram_cls( name="vllm:request_queue_time_seconds", documentation= "Histogram of time spent 
in WAITING phase for request.", buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) - self.histogram_inference_time_request = \ + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + + self.histogram_inference_time_request = {idx: self._histogram_cls( name="vllm:request_inference_time_seconds", documentation= "Histogram of time spent in RUNNING phase for request.", buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) - self.histogram_prefill_time_request = \ + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + + self.histogram_prefill_time_request = {idx: self._histogram_cls( name="vllm:request_prefill_time_seconds", documentation= "Histogram of time spent in PREFILL phase for request.", buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) - self.histogram_decode_time_request = \ + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + + self.histogram_decode_time_request = {idx: self._histogram_cls( name="vllm:request_decode_time_seconds", documentation= "Histogram of time spent in DECODE phase for request.", buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) - - # - # LoRA metrics - # - - # TODO: This metric might be incorrect in case of using multiple - # api_server counts which uses prometheus mp. - self.gauge_lora_info: Optional[prometheus_client.Gauge] = None - if vllm_config.lora_config is not None: - self.labelname_max_lora = "max_lora" - self.labelname_waiting_lora_adapters = "waiting_lora_adapters" - self.labelname_running_lora_adapters = "running_lora_adapters" - self.max_lora = vllm_config.lora_config.max_loras - self.gauge_lora_info = \ - self._gauge_cls( - name="vllm:lora_requests_info", - documentation="Running stats on lora requests.", - multiprocess_mode="sum", - labelnames=[ - self.labelname_max_lora, - self.labelname_waiting_lora_adapters, - self.labelname_running_lora_adapters, - ], - ) + labelnames=labelnames).labels(model_name, idx) + for idx in self.engine_indexes + } + + + # # + # # LoRA metrics + # # + + # # TODO: This metric might be incorrect in case of using multiple + # # api_server counts which uses prometheus mp. 
+ # self.gauge_lora_info: Optional[prometheus_client.Gauge] = None + # if vllm_config.lora_config is not None: + # self.labelname_max_lora = "max_lora" + # self.labelname_waiting_lora_adapters = "waiting_lora_adapters" + # self.labelname_running_lora_adapters = "running_lora_adapters" + # self.max_lora = vllm_config.lora_config.max_loras + # self.gauge_lora_info = \ + # self._gauge_cls( + # name="vllm:lora_requests_info", + # documentation="Running stats on lora requests.", + # multiprocess_mode="sum", + # labelnames=[ + # self.labelname_max_lora, + # self.labelname_waiting_lora_adapters, + # self.labelname_running_lora_adapters, + # ], + # ) def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): From de91a3cd6a6bbe8c3aef5e55bb63dbb915129cdb Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 14:38:45 +0000 Subject: [PATCH 23/51] convert to use only one prometheus stat logger per async llm Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 64 ++++--- vllm/v1/metrics/loggers.py | 352 +++++++++++++++++++----------------- 2 files changed, 228 insertions(+), 188 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a76ad96bf88e..75e1183c80d0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -36,8 +36,8 @@ from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import (StatLoggerBase, StatLoggerFactory, - setup_default_loggers) +from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase, + StatLoggerFactory, setup_default_loggers) from vllm.v1.metrics.prometheus import shutdown_prometheus from vllm.v1.metrics.stats import IterationStats, SchedulerStats @@ -96,10 +96,10 @@ def __init__( self.log_stats = log_stats # Set up stat loggers; independent set for each DP rank. - self.stat_loggers: list[list[StatLoggerBase]] = setup_default_loggers( + self.stat_loggers = setup_default_loggers( vllm_config=vllm_config, log_stats=self.log_stats, - engine_num=vllm_config.parallel_config.data_parallel_size, + num_engines=vllm_config.parallel_config.data_parallel_size, custom_stat_loggers=stat_loggers, ) @@ -130,7 +130,8 @@ def __init__( client_index=client_index, ) if self.stat_loggers: - for stat_logger in self.stat_loggers[0]: + per_engine_loggers, _ = self.stat_loggers + for stat_logger in per_engine_loggers[0]: stat_logger.log_engine_initialized() self.output_handler: Optional[asyncio.Task] = None try: @@ -410,11 +411,10 @@ async def output_handler(): # 4) Logging. # TODO(rob): make into a coroutine and launch it in # background thread once Prometheus overhead is non-trivial. 
- logger.info(f"{outputs.engine_index=}") - logger.info(f"{stat_loggers[outputs.engine_index]=}") - if stat_loggers: + if stat_loggers is not None: AsyncLLM._record_stats( - stat_loggers[outputs.engine_index], + stat_loggers, + outputs.engine_index, scheduler_stats=outputs.scheduler_stats, iteration_stats=iteration_stats, ) @@ -435,15 +435,22 @@ async def abort(self, request_id: str) -> None: @staticmethod def _record_stats( - stat_loggers: list[StatLoggerBase], + stat_loggers: tuple[list[list[StatLoggerBase]], PrometheusStatLogger], + engine_idx: int, scheduler_stats: Optional[SchedulerStats], iteration_stats: Optional[IterationStats], ): """static so that it can be used from the output_handler task without a circular ref to AsyncLLM.""" - for stat_logger in stat_loggers: - stat_logger.record(scheduler_stats=scheduler_stats, + + per_engine_loggers, prom_logger = stat_loggers + for stat_logger in per_engine_loggers[engine_idx]: + stat_logger.record(engine_idx=engine_idx, + scheduler_stats=scheduler_stats, iteration_stats=iteration_stats) + prom_logger.record(engine_idx=engine_idx, + scheduler_stats=scheduler_stats, + iteration_stats=iteration_stats) async def encode( self, @@ -549,8 +556,13 @@ async def do_log_stats( scheduler_outputs=None, model_output=None, ) -> None: - for loggers in self.stat_loggers: - for stat_logger in loggers: + + if self.stat_loggers is None: + return + + per_engine_loggers, _ = self.stat_loggers + for engine_loggers in per_engine_loggers: + for stat_logger in engine_loggers: stat_logger.log() async def check_health(self) -> None: @@ -655,18 +667,18 @@ async def scale_elastic_ep(self, new_data_parallel_size # recreate stat loggers - if new_data_parallel_size > old_data_parallel_size: - stat_loggers: list[list[StatLoggerBase]] = setup_default_loggers( - vllm_config=self.vllm_config, - log_stats=self.log_stats, - engine_num=new_data_parallel_size, - custom_stat_loggers=None, - ) - num_new_engines = len(stat_loggers) - len(self.stat_loggers) - self.stat_loggers.extend(stat_loggers[-num_new_engines:]) - else: - for _ in range(old_data_parallel_size - new_data_parallel_size): - self.stat_loggers.pop() + # if new_data_parallel_size > old_data_parallel_size: + # stat_loggers: list[list[StatLoggerBase]] = setup_default_loggers( + # vllm_config=self.vllm_config, + # log_stats=self.log_stats, + # engine_num=new_data_parallel_size, + # custom_stat_loggers=None, + # ) + # num_new_engines = len(stat_loggers) - len(self.stat_loggers) + # self.stat_loggers.extend(stat_loggers[-num_new_engines:]) + # else: + # for _ in range(old_data_parallel_size - new_data_parallel_size): + # self.stat_loggers.pop() @property def is_running(self) -> bool: diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 70442855f7b3..c751ac03d8e2 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -13,7 +13,6 @@ from vllm.logger import init_logger from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics from vllm.v1.engine import FinishReason -from vllm.v1.metrics.prometheus import unregister_vllm_metrics from vllm.v1.metrics.stats import IterationStats, SchedulerStats from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm @@ -35,8 +34,10 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): ... 
@abstractmethod - def record(self, scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats]): + def record(self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: int = 0): ... @abstractmethod @@ -78,8 +79,10 @@ def _get_throughput(self, tracked_stats: list[int], now: float) -> float: # Compute summary metrics for tracked stats return float(np.sum(tracked_stats) / (now - self.last_log_time)) - def record(self, scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats]): + def record(self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: int = 0): """Log Stats to standard output.""" if iteration_stats: @@ -146,11 +149,11 @@ class PrometheusStatLogger(StatLoggerBase): _histogram_cls = prometheus_client.Histogram _spec_decoding_cls = SpecDecodingProm - def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): + def __init__(self, vllm_config: VllmConfig, num_engines: int = 1): # unregister_vllm_metrics() self.vllm_config = vllm_config - self.engine_indexes = [str(idx) for idx in engine_indexes] + self.engine_indexes = range(num_engines) # Use this flag to hide metrics that were deprecated in # a previous release and which will be removed future self.show_hidden_metrics = \ @@ -167,20 +170,22 @@ def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): # Scheduler state # self.gauge_scheduler_running = { - idx: self._gauge_cls( + idx: + self._gauge_cls( name="vllm:num_requests_running", documentation="Number of requests in model execution batches.", multiprocess_mode="mostrecent", - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } self.gauge_scheduler_waiting = { + idx: self._gauge_cls( name="vllm:num_requests_waiting", documentation="Number of requests waiting to be processed.", multiprocess_mode="mostrecent", - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } @@ -190,96 +195,102 @@ def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc # TODO: in 0.10, only enable if show_hidden_metrics=True self.gauge_gpu_cache_usage = { - idx: self._gauge_cls( + idx: + self._gauge_cls( name="vllm:gpu_cache_usage_perc", documentation=( "GPU KV-cache usage. 1 means 100 percent usage." "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), multiprocess_mode="mostrecent", - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries # TODO: in 0.10, only enable if show_hidden_metrics=True self.counter_gpu_prefix_cache_queries = { - idx: self._counter_cls( + idx: + self._counter_cls( name="vllm:gpu_prefix_cache_queries", documentation=( - "GPU prefix cache queries, in terms of number of queried tokens." - "DEPRECATED: Use vllm:prefix_cache_queries instead."), - labelnames=labelnames).labels(model_name, idx) + "GPU prefix cache queries, in terms of number of queried" + "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead." 
+ ), + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits # TODO: in 0.10, only enable if show_hidden_metrics=True self.counter_gpu_prefix_cache_hits = { - idx: self._counter_cls( + idx: + self._counter_cls( name="vllm:gpu_prefix_cache_hits", documentation=( - "GPU prefix cache hits, in terms of number of cached tokens." - "DEPRECATED: Use vllm:prefix_cache_hits instead."), - labelnames=labelnames).labels(model_name, idx) + "GPU prefix cache hits, in terms of number of cached " + "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."), + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - self.gauge_kv_cache_usage = { - idx: self._gauge_cls( - name="vllm:kv_cache_usage_perc", - documentation="KV-cache usage. 1 means 100 percent usage.", - labelnames=labelnames).labels(model_name, idx) + idx: + self._gauge_cls( + name="vllm:kv_cache_usage_perc", + documentation="KV-cache usage. 1 means 100 percent usage.", + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } self.counter_prefix_cache_queries = { - idx: elf._counter_cls( - name="vllm:prefix_cache_queries", - documentation=( - "Prefix cache queries, in terms of number of queried tokens."), - labelnames=labelnames).labels(model_name, idx) + idx: + self._counter_cls( + name="vllm:prefix_cache_queries", + documentation= + ("Prefix cache queries, in terms of number of queried tokens." + ), + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - self.counter_prefix_cache_hits = { - idx: self._counter_cls( - name="vllm:prefix_cache_hits", - documentation=( - "Prefix cache hits, in terms of number of cached tokens."), - labelnames=labelnames).labels(model_name, idx) + idx: + self._counter_cls( + name="vllm:prefix_cache_hits", + documentation=( + "Prefix cache hits, in terms of number of cached tokens."), + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - # # Counters # self.counter_num_preempted_reqs = { - idx: self._counter_cls( - name="vllm:num_preemptions", - documentation="Cumulative number of preemption from the engine.", - labelnames=labelnames).labels(model_name, idx) + idx: + self._counter_cls( + name="vllm:num_preemptions", + documentation= + "Cumulative number of preemption from the engine.", + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - self.counter_prompt_tokens = { - idx: self._counter_cls( - name="vllm:prompt_tokens", - documentation="Number of prefill tokens processed.", - labelnames=labelnames).labels(model_name, idx) + idx: + self._counter_cls( + name="vllm:prompt_tokens", + documentation="Number of prefill tokens processed.", + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - self.counter_generation_tokens = { - idx: self._counter_cls( - name="vllm:generation_tokens", - documentation="Number of generation tokens processed.", - labelnames=labelnames).labels(model_name, idx) + idx: + self._counter_cls( + name="vllm:generation_tokens", + documentation="Number of generation tokens processed.", + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } @@ -290,37 +301,41 @@ def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): documentation="Count of successfully processed requests.", labelnames=labelnames + ["finished_reason"]) for reason in FinishReason: - 
self.counter_request_success[ - reason] = {idx: counter_request_success_base.labels(model_name, idx, str(reason)) for idx in self.engine_indexes} + self.counter_request_success[reason] = { + idx: + counter_request_success_base.labels(model_name, str(idx), + str(reason)) + for idx in self.engine_indexes + } # # Histograms of counts # self.histogram_num_prompt_tokens_request = { - idx: self._histogram_cls( + idx: + self._histogram_cls( name="vllm:request_prompt_tokens", documentation="Number of prefill tokens processed.", buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - self.histogram_num_generation_tokens_request = { - idx: self._histogram_cls( + idx: + self._histogram_cls( name="vllm:request_generation_tokens", documentation="Number of generation tokens processed.", buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - # TODO: This metric might be incorrect in case of using multiple # api_server counts which uses prometheus mp. # See: https://github.com/vllm-project/vllm/pull/18053 - self.histogram_iteration_tokens = {idx: - + self.histogram_iteration_tokens = { + idx: self._histogram_cls( name="vllm:iteration_tokens_total", documentation="Histogram of number of tokens per engine_step.", @@ -328,46 +343,46 @@ def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384 ], - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - self.histogram_max_num_generation_tokens_request = { - idx: self._histogram_cls( + idx: + self._histogram_cls( name="vllm:request_max_num_generation_tokens", documentation= "Histogram of maximum number of requested generation tokens.", buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - - self.histogram_n_request = {idx: + self.histogram_n_request = { + idx: self._histogram_cls( name="vllm:request_params_n", documentation="Histogram of the n request parameter.", buckets=[1, 2, 5, 10, 20], - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - - self.histogram_max_tokens_request = {idx: + self.histogram_max_tokens_request = { + idx: self._histogram_cls( name="vllm:request_params_max_tokens", documentation="Histogram of the max_tokens request parameter.", buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - # # Histogram of timing intervals # - self.histogram_time_to_first_token = {idx: + self.histogram_time_to_first_token = { + idx: self._histogram_cls( name="vllm:time_to_first_token_seconds", documentation="Histogram of time to first token in seconds.", @@ -376,12 +391,12 @@ def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 2560.0 ], - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - - self.histogram_time_per_output_token = {idx: + 
self.histogram_time_per_output_token = { + idx: self._histogram_cls( name="vllm:time_per_output_token_seconds", documentation="Histogram of time per output token in seconds.", @@ -389,65 +404,68 @@ def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 ], - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - request_latency_buckets = [ 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 ] - self.histogram_e2e_time_request = {idx: + self.histogram_e2e_time_request = { + idx: self._histogram_cls( name="vllm:e2e_request_latency_seconds", documentation="Histogram of e2e request latency in seconds.", buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - self.histogram_queue_time_request = {idx: + self.histogram_queue_time_request = { + idx: self._histogram_cls( name="vllm:request_queue_time_seconds", documentation= "Histogram of time spent in WAITING phase for request.", buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - self.histogram_inference_time_request = {idx: + self.histogram_inference_time_request = { + idx: self._histogram_cls( name="vllm:request_inference_time_seconds", documentation= "Histogram of time spent in RUNNING phase for request.", buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - self.histogram_prefill_time_request = {idx: + self.histogram_prefill_time_request = { + idx: self._histogram_cls( name="vllm:request_prefill_time_seconds", documentation= "Histogram of time spent in PREFILL phase for request.", buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - self.histogram_decode_time_request = {idx: + self.histogram_decode_time_request = { + idx: self._histogram_cls( name="vllm:request_decode_time_seconds", documentation= "Histogram of time spent in DECODE phase for request.", buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, idx) + labelnames=labelnames).labels(model_name, str(idx)) for idx in self.engine_indexes } - # # # # LoRA metrics # # @@ -473,105 +491,113 @@ def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): # ) def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): - - metrics_info = config_obj.metrics_info() - metrics_info["engine"] = self.engine_index - - name, documentation = None, None - if type == "cache_config": - name = "vllm:cache_config_info" - documentation = "Information of the LLMEngine CacheConfig" - assert name is not None, f"Unknown metrics info type {type}" - - # Info type metrics are syntactic sugar for a gauge permanently set to 1 - # Since prometheus multiprocessing mode does not support Info, emulate - # info here with a gauge. 
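The per-engine conversions earlier in this patch all follow one pattern: keep a single metric family per name and pre-resolve one labelled child per engine index, so the hot recording path is a plain dict lookup. A minimal self-contained sketch of that pattern (assuming prometheus_client; the metric name, model name, and engine indexes here are illustrative, not the ones vLLM registers):

```python
import prometheus_client

# One family, distinguished per engine via the "engine" label.
counter_family = prometheus_client.Counter(
    name="demo_prompt_tokens",
    documentation="Prefill tokens processed (illustrative).",
    labelnames=["model_name", "engine"],
)

engine_indexes = [0, 1, 2, 3]
counter_prompt_tokens = {
    idx: counter_family.labels("demo-model", str(idx))
    for idx in engine_indexes
}

# Recording against engine 2 touches only that engine's child.
counter_prompt_tokens[2].inc(128)
```

The label value is stringified up front (str(idx)), which is what the repeated idx -> str(idx) changes in this hunk are about.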
- info_gauge = self._gauge_cls( - name=name, - documentation=documentation, - multiprocess_mode="mostrecent", - labelnames=metrics_info.keys(), - ).labels(**metrics_info) - info_gauge.set(1) - - def record(self, scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats]): - logger.info(f"PROM STAT LOGGER: {self.engine_index=}") + pass + # metrics_info = config_obj.metrics_info() + # metrics_info["engine"] = self.engine_index + + # name, documentation = None, None + # if type == "cache_config": + # name = "vllm:cache_config_info" + # documentation = "Information of the LLMEngine CacheConfig" + # assert name is not None, f"Unknown metrics info type {type}" + + # # Info type metrics are syntactic sugar for a gauge permanently set to 1 # noqa: E501 + # # Since prometheus multiprocessing mode does not support Info, emulate + # # info here with a gauge. + # info_gauge = self._gauge_cls( + # name=name, + # documentation=documentation, + # multiprocess_mode="mostrecent", + # labelnames=metrics_info.keys(), + # ).labels(**metrics_info) + # info_gauge.set(1) + + def record(self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: int = 0): """Log to prometheus.""" if scheduler_stats is not None: - self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) - self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + self.gauge_scheduler_running[engine_idx].set( + scheduler_stats.num_running_reqs) + self.gauge_scheduler_waiting[engine_idx].set( + scheduler_stats.num_waiting_reqs) - self.gauge_gpu_cache_usage.set(scheduler_stats.kv_cache_usage) - self.gauge_kv_cache_usage.set(scheduler_stats.kv_cache_usage) + self.gauge_gpu_cache_usage[engine_idx].set( + scheduler_stats.kv_cache_usage) + self.gauge_kv_cache_usage[engine_idx].set( + scheduler_stats.kv_cache_usage) - self.counter_gpu_prefix_cache_queries.inc( + self.counter_gpu_prefix_cache_queries[engine_idx].inc( scheduler_stats.prefix_cache_stats.queries) - self.counter_gpu_prefix_cache_hits.inc( + self.counter_gpu_prefix_cache_hits[engine_idx].inc( scheduler_stats.prefix_cache_stats.hits) - self.counter_prefix_cache_queries.inc( + self.counter_prefix_cache_queries[engine_idx].inc( scheduler_stats.prefix_cache_stats.queries) - self.counter_prefix_cache_hits.inc( + self.counter_prefix_cache_hits[engine_idx].inc( scheduler_stats.prefix_cache_stats.hits) - if scheduler_stats.spec_decoding_stats is not None: - self.spec_decoding_prom.observe( - scheduler_stats.spec_decoding_stats) + # if scheduler_stats.spec_decoding_stats is not None: + # self.spec_decoding_prom.observe( + # scheduler_stats.spec_decoding_stats) if iteration_stats is None: return - self.counter_num_preempted_reqs.inc(iteration_stats.num_preempted_reqs) - self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) - self.counter_generation_tokens.inc( + self.counter_num_preempted_reqs[engine_idx].inc( + iteration_stats.num_preempted_reqs) + self.counter_prompt_tokens[engine_idx].inc( + iteration_stats.num_prompt_tokens) + self.counter_generation_tokens[engine_idx].inc( iteration_stats.num_generation_tokens) - self.histogram_iteration_tokens.observe( + self.histogram_iteration_tokens[engine_idx].observe( iteration_stats.num_prompt_tokens + \ iteration_stats.num_generation_tokens) for max_gen_tokens in iteration_stats.max_num_generation_tokens_iter: - self.histogram_max_num_generation_tokens_request.observe( - max_gen_tokens) + self.histogram_max_num_generation_tokens_request[ + 
engine_idx].observe(max_gen_tokens) for n_param in iteration_stats.n_params_iter: - self.histogram_n_request.observe(n_param) + self.histogram_n_request[engine_idx].observe(n_param) for ttft in iteration_stats.time_to_first_tokens_iter: - self.histogram_time_to_first_token.observe(ttft) + self.histogram_time_to_first_token[engine_idx].observe(ttft) for tpot in iteration_stats.time_per_output_tokens_iter: - self.histogram_time_per_output_token.observe(tpot) + self.histogram_time_per_output_token[engine_idx].observe(tpot) for finished_request in iteration_stats.finished_requests: - self.counter_request_success[finished_request.finish_reason].inc() - self.histogram_e2e_time_request.observe( + self.counter_request_success[ + finished_request.finish_reason][engine_idx].inc() + self.histogram_e2e_time_request[engine_idx].observe( finished_request.e2e_latency) - self.histogram_queue_time_request.observe( + self.histogram_queue_time_request[engine_idx].observe( finished_request.queued_time) - self.histogram_prefill_time_request.observe( + self.histogram_prefill_time_request[engine_idx].observe( finished_request.prefill_time) - self.histogram_inference_time_request.observe( + self.histogram_inference_time_request[engine_idx].observe( finished_request.inference_time) - self.histogram_decode_time_request.observe( + self.histogram_decode_time_request[engine_idx].observe( finished_request.decode_time) - self.histogram_num_prompt_tokens_request.observe( + self.histogram_num_prompt_tokens_request[engine_idx].observe( finished_request.num_prompt_tokens) - self.histogram_num_generation_tokens_request.observe( + self.histogram_num_generation_tokens_request[engine_idx].observe( finished_request.num_generation_tokens) if finished_request.max_tokens_param: - self.histogram_max_tokens_request.observe( + self.histogram_max_tokens_request[engine_idx].observe( finished_request.max_tokens_param) - if self.gauge_lora_info is not None: - running_lora_adapters = \ - ",".join(iteration_stats.running_lora_adapters.keys()) - waiting_lora_adapters = \ - ",".join(iteration_stats.waiting_lora_adapters.keys()) - lora_info_labels = { - self.labelname_running_lora_adapters: running_lora_adapters, - self.labelname_waiting_lora_adapters: waiting_lora_adapters, - self.labelname_max_lora: self.max_lora, - } - self.gauge_lora_info.labels(**lora_info_labels)\ - .set_to_current_time() + # if self.gauge_lora_info is not None: + # running_lora_adapters = \ + # ",".join(iteration_stats.running_lora_adapters.keys()) + # waiting_lora_adapters = \ + # ",".join(iteration_stats.waiting_lora_adapters.keys()) + # lora_info_labels = { + # self.labelname_running_lora_adapters: running_lora_adapters, + # self.labelname_waiting_lora_adapters: waiting_lora_adapters, + # self.labelname_max_lora: self.max_lora, + # } + # self.gauge_lora_info.labels(**lora_info_labels)\ + # .set_to_current_time() def log_engine_initialized(self): self.log_metrics_info("cache_config", self.vllm_config.cache_config) @@ -607,26 +633,28 @@ def build_1_2_5_buckets(max_value: int) -> list[int]: def setup_default_loggers( vllm_config: VllmConfig, log_stats: bool, - engine_num: int, + num_engines: int, custom_stat_loggers: Optional[list[StatLoggerFactory]] = None, -) -> list[list[StatLoggerBase]]: +) -> Optional[tuple[list[list[StatLoggerBase]], PrometheusStatLogger]]: """Setup logging and prometheus metrics.""" if not log_stats: - return [] + return None factories: list[StatLoggerFactory] if custom_stat_loggers is not None: factories = custom_stat_loggers else: - factories = 
[PrometheusStatLogger] + factories = [] if logger.isEnabledFor(logging.INFO): factories.append(LoggingStatLogger) stat_loggers: list[list[StatLoggerBase]] = [] - for i in range(engine_num): + for engine_idx in range(num_engines): per_engine_stat_loggers: list[StatLoggerBase] = [] for logger_factory in factories: - per_engine_stat_loggers.append(logger_factory(vllm_config, i)) + per_engine_stat_loggers.append( + logger_factory(vllm_config, engine_idx)) stat_loggers.append(per_engine_stat_loggers) - return stat_loggers + prom_stat_logger = PrometheusStatLogger(vllm_config, num_engines) + return stat_loggers, prom_stat_logger From e08e1e99eeb29bd89a4b9e5b97f73ee66df74ae8 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 14:41:55 +0000 Subject: [PATCH 24/51] cleanup prometheus logging Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/metrics/loggers.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 75e1183c80d0..c012ef78412a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -99,7 +99,7 @@ def __init__( self.stat_loggers = setup_default_loggers( vllm_config=vllm_config, log_stats=self.log_stats, - num_engines=vllm_config.parallel_config.data_parallel_size, + engine_num=vllm_config.parallel_config.data_parallel_size, custom_stat_loggers=stat_loggers, ) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index c751ac03d8e2..b2a1a7d3f931 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -149,11 +149,11 @@ class PrometheusStatLogger(StatLoggerBase): _histogram_cls = prometheus_client.Histogram _spec_decoding_cls = SpecDecodingProm - def __init__(self, vllm_config: VllmConfig, num_engines: int = 1): + def __init__(self, vllm_config: VllmConfig, engine_num: int = 1): # unregister_vllm_metrics() self.vllm_config = vllm_config - self.engine_indexes = range(num_engines) + self.engine_indexes = range(engine_num) # Use this flag to hide metrics that were deprecated in # a previous release and which will be removed future self.show_hidden_metrics = \ @@ -633,7 +633,7 @@ def build_1_2_5_buckets(max_value: int) -> list[int]: def setup_default_loggers( vllm_config: VllmConfig, log_stats: bool, - num_engines: int, + engine_num: int, custom_stat_loggers: Optional[list[StatLoggerFactory]] = None, ) -> Optional[tuple[list[list[StatLoggerBase]], PrometheusStatLogger]]: """Setup logging and prometheus metrics.""" @@ -649,12 +649,12 @@ def setup_default_loggers( factories.append(LoggingStatLogger) stat_loggers: list[list[StatLoggerBase]] = [] - for engine_idx in range(num_engines): + for engine_idx in range(engine_num): per_engine_stat_loggers: list[StatLoggerBase] = [] for logger_factory in factories: per_engine_stat_loggers.append( logger_factory(vllm_config, engine_idx)) stat_loggers.append(per_engine_stat_loggers) - prom_stat_logger = PrometheusStatLogger(vllm_config, num_engines) + prom_stat_logger = PrometheusStatLogger(vllm_config, engine_num) return stat_loggers, prom_stat_logger From d39cf9380d73a52921fd974f1e39f0dd6091578f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 14:42:48 +0000 Subject: [PATCH 25/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c012ef78412a..235ac5f67f26 100644 --- a/vllm/v1/engine/async_llm.py +++ 
b/vllm/v1/engine/async_llm.py @@ -411,7 +411,7 @@ async def output_handler(): # 4) Logging. # TODO(rob): make into a coroutine and launch it in # background thread once Prometheus overhead is non-trivial. - if stat_loggers is not None: + if stat_loggers: AsyncLLM._record_stats( stat_loggers, outputs.engine_index, From 9a2e26d049ae06dcb1122d61abc2a8644e156483 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 14:44:54 +0000 Subject: [PATCH 26/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 235ac5f67f26..2758c59f49ee 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -556,13 +556,11 @@ async def do_log_stats( scheduler_outputs=None, model_output=None, ) -> None: - if self.stat_loggers is None: return - per_engine_loggers, _ = self.stat_loggers - for engine_loggers in per_engine_loggers: - for stat_logger in engine_loggers: + for loggers in per_engine_loggers: + for stat_logger in loggers: stat_logger.log() async def check_health(self) -> None: From 3956d8ccad3f1eb793f4417e8beb506aa8a487a2 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 15:22:43 +0000 Subject: [PATCH 27/51] updated Signed-off-by: Robert Shaw --- vllm/v1/metrics/loggers.py | 478 +++++++++++++++++-------------------- 1 file changed, 222 insertions(+), 256 deletions(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index b2a1a7d3f931..0288fe2f2197 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -4,7 +4,7 @@ import logging import time from abc import ABC, abstractmethod -from typing import Callable, Optional +from typing import Callable, Optional, Union import numpy as np import prometheus_client @@ -153,15 +153,15 @@ def __init__(self, vllm_config: VllmConfig, engine_num: int = 1): # unregister_vllm_metrics() self.vllm_config = vllm_config - self.engine_indexes = range(engine_num) # Use this flag to hide metrics that were deprecated in # a previous release and which will be removed future self.show_hidden_metrics = \ vllm_config.observability_config.show_hidden_metrics labelnames = ["model_name", "engine"] - model_name = vllm_config.model_config.served_model_name, + model_name = vllm_config.model_config.served_model_name max_model_len = vllm_config.model_config.max_model_len + engine_indexes = list(range(engine_num)) # self.spec_decoding_prom = self._spec_decoding_cls( # vllm_config.speculative_config, labelnames, labelvalues) @@ -169,133 +169,112 @@ def __init__(self, vllm_config: VllmConfig, engine_num: int = 1): # # Scheduler state # - self.gauge_scheduler_running = { - idx: - self._gauge_cls( - name="vllm:num_requests_running", - documentation="Number of requests in model execution batches.", - multiprocess_mode="mostrecent", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.gauge_scheduler_waiting = { - idx: - self._gauge_cls( - name="vllm:num_requests_waiting", - documentation="Number of requests waiting to be processed.", - multiprocess_mode="mostrecent", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + gauge_scheduler_running = self._gauge_cls( + name="vllm:num_requests_running", + documentation="Number of requests in model execution batches.", + multiprocess_mode="mostrecent", + labelnames=labelnames) + self.gauge_scheduler_running = 
make_per_engine(gauge_scheduler_running, + engine_indexes, + model_name) + + gauge_scheduler_waiting = self._gauge_cls( + name="vllm:num_requests_waiting", + documentation="Number of requests waiting to be processed.", + multiprocess_mode="mostrecent", + labelnames=labelnames) + self.gauge_scheduler_waiting = make_per_engine(gauge_scheduler_waiting, + engine_indexes, + model_name) # # GPU cache # # Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc # TODO: in 0.10, only enable if show_hidden_metrics=True - self.gauge_gpu_cache_usage = { - idx: - self._gauge_cls( - name="vllm:gpu_cache_usage_perc", - documentation=( - "GPU KV-cache usage. 1 means 100 percent usage." - "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), - multiprocess_mode="mostrecent", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + gauge_gpu_cache_usage = self._gauge_cls( + name="vllm:gpu_cache_usage_perc", + documentation=( + "GPU KV-cache usage. 1 means 100 percent usage." + "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), + multiprocess_mode="mostrecent", + labelnames=labelnames) + self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage, + engine_indexes, + model_name) # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries # TODO: in 0.10, only enable if show_hidden_metrics=True - self.counter_gpu_prefix_cache_queries = { - idx: - self._counter_cls( - name="vllm:gpu_prefix_cache_queries", - documentation=( - "GPU prefix cache queries, in terms of number of queried" - "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead." - ), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + counter_gpu_prefix_cache_queries = self._counter_cls( + name="vllm:gpu_prefix_cache_queries", + documentation=( + "GPU prefix cache queries, in terms of number of queried" + "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."), + labelnames=labelnames) + self.counter_gpu_prefix_cache_queries = make_per_engine( + counter_gpu_prefix_cache_queries, engine_indexes, model_name) # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits # TODO: in 0.10, only enable if show_hidden_metrics=True - self.counter_gpu_prefix_cache_hits = { - idx: - self._counter_cls( - name="vllm:gpu_prefix_cache_hits", - documentation=( - "GPU prefix cache hits, in terms of number of cached " - "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.gauge_kv_cache_usage = { - idx: - self._gauge_cls( - name="vllm:kv_cache_usage_perc", - documentation="KV-cache usage. 1 means 100 percent usage.", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.counter_prefix_cache_queries = { - idx: - self._counter_cls( - name="vllm:prefix_cache_queries", - documentation= - ("Prefix cache queries, in terms of number of queried tokens." - ), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.counter_prefix_cache_hits = { - idx: - self._counter_cls( - name="vllm:prefix_cache_hits", - documentation=( - "Prefix cache hits, in terms of number of cached tokens."), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + counter_gpu_prefix_cache_hits = self._counter_cls( + name="vllm:gpu_prefix_cache_hits", + documentation=( + "GPU prefix cache hits, in terms of number of cached " + "tokens. 
DEPRECATED: Use vllm:prefix_cache_hits instead."), + labelnames=labelnames) + self.counter_gpu_prefix_cache_hits = make_per_engine( + counter_gpu_prefix_cache_hits, engine_indexes, model_name) + + gauge_kv_cache_usage = self._gauge_cls( + name="vllm:kv_cache_usage_perc", + documentation="KV-cache usage. 1 means 100 percent usage.", + labelnames=labelnames) + self.gauge_kv_cache_usage = make_per_engine(gauge_kv_cache_usage, + engine_indexes, model_name) + + counter_prefix_cache_queries = self._counter_cls( + name="vllm:prefix_cache_queries", + documentation=( + "Prefix cache queries, in terms of number of queried tokens."), + labelnames=labelnames) + self.counter_prefix_cache_queries = make_per_engine( + counter_prefix_cache_queries, engine_indexes, model_name) + + counter_prefix_cache_hits = self._counter_cls( + name="vllm:prefix_cache_hits", + documentation=( + "Prefix cache hits, in terms of number of cached tokens."), + labelnames=labelnames) + self.counter_prefix_cache_hits = make_per_engine( + counter_prefix_cache_hits, engine_indexes, model_name) # # Counters # - self.counter_num_preempted_reqs = { - idx: - self._counter_cls( - name="vllm:num_preemptions", - documentation= - "Cumulative number of preemption from the engine.", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.counter_prompt_tokens = { - idx: - self._counter_cls( - name="vllm:prompt_tokens", - documentation="Number of prefill tokens processed.", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.counter_generation_tokens = { - idx: - self._counter_cls( - name="vllm:generation_tokens", - documentation="Number of generation tokens processed.", - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.counter_request_success: dict[FinishReason, - prometheus_client.Counter] = {} + counter_num_preempted_reqs = self._counter_cls( + name="vllm:num_preemptions", + documentation="Cumulative number of preemption from the engine.", + labelnames=labelnames) + self.counter_num_preempted_reqs = make_per_engine( + counter_num_preempted_reqs, engine_indexes, model_name) + + counter_prompt_tokens = self._counter_cls( + name="vllm:prompt_tokens", + documentation="Number of prefill tokens processed.", + labelnames=labelnames) + self.counter_prompt_tokens = make_per_engine(counter_prompt_tokens, + engine_indexes, + model_name) + + counter_generation_tokens = self._counter_cls( + name="vllm:generation_tokens", + documentation="Number of generation tokens processed.", + labelnames=labelnames) + self.counter_generation_tokens = make_per_engine( + counter_generation_tokens, engine_indexes, model_name) + + self.counter_request_success: dict[FinishReason, dict[ + int, prometheus_client.Counter]] = {} counter_request_success_base = self._counter_cls( name="vllm:request_success", documentation="Count of successfully processed requests.", @@ -305,166 +284,141 @@ def __init__(self, vllm_config: VllmConfig, engine_num: int = 1): idx: counter_request_success_base.labels(model_name, str(idx), str(reason)) - for idx in self.engine_indexes + for idx in engine_indexes } # # Histograms of counts # - self.histogram_num_prompt_tokens_request = { - idx: - self._histogram_cls( - name="vllm:request_prompt_tokens", - documentation="Number of prefill tokens processed.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - 
self.histogram_num_generation_tokens_request = { - idx: - self._histogram_cls( - name="vllm:request_generation_tokens", - documentation="Number of generation tokens processed.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_num_prompt_tokens_request = self._histogram_cls( + name="vllm:request_prompt_tokens", + documentation="Number of prefill tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_num_prompt_tokens_request = make_per_engine( + histogram_num_prompt_tokens_request, engine_indexes, model_name) + + histogram_num_generation_tokens_request = self._histogram_cls( + name="vllm:request_generation_tokens", + documentation="Number of generation tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_num_generation_tokens_request = make_per_engine( + histogram_num_generation_tokens_request, engine_indexes, + model_name) # TODO: This metric might be incorrect in case of using multiple # api_server counts which uses prometheus mp. # See: https://github.com/vllm-project/vllm/pull/18053 - self.histogram_iteration_tokens = { - idx: - self._histogram_cls( - name="vllm:iteration_tokens_total", - documentation="Histogram of number of tokens per engine_step.", - buckets=[ - 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, - 16384 - ], - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.histogram_max_num_generation_tokens_request = { - idx: - self._histogram_cls( - name="vllm:request_max_num_generation_tokens", - documentation= - "Histogram of maximum number of requested generation tokens.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.histogram_n_request = { - idx: - self._histogram_cls( - name="vllm:request_params_n", - documentation="Histogram of the n request parameter.", - buckets=[1, 2, 5, 10, 20], - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.histogram_max_tokens_request = { - idx: - self._histogram_cls( - name="vllm:request_params_max_tokens", - documentation="Histogram of the max_tokens request parameter.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_iteration_tokens = self._histogram_cls( + name="vllm:iteration_tokens_total", + documentation="Histogram of number of tokens per engine_step.", + buckets=[ + 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384 + ], + labelnames=labelnames) + self.histogram_iteration_tokens = make_per_engine( + histogram_iteration_tokens, engine_indexes, model_name) + + histogram_max_num_generation_tokens_request = self._histogram_cls( + name="vllm:request_max_num_generation_tokens", + documentation= + "Histogram of maximum number of requested generation tokens.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_max_num_generation_tokens_request = make_per_engine( + histogram_max_num_generation_tokens_request, engine_indexes, + model_name) + + histogram_n_request = self._histogram_cls( + name="vllm:request_params_n", + documentation="Histogram of the n request parameter.", + buckets=[1, 2, 5, 10, 20], + labelnames=labelnames) + self.histogram_n_request = 
make_per_engine(histogram_n_request, + engine_indexes, model_name) + + histogram_max_tokens_request = self._histogram_cls( + name="vllm:request_params_max_tokens", + documentation="Histogram of the max_tokens request parameter.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_max_tokens_request = make_per_engine( + histogram_max_tokens_request, engine_indexes, model_name) # # Histogram of timing intervals # - self.histogram_time_to_first_token = { - idx: - self._histogram_cls( - name="vllm:time_to_first_token_seconds", - documentation="Histogram of time to first token in seconds.", - buckets=[ - 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, - 640.0, 2560.0 - ], - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.histogram_time_per_output_token = { - idx: - self._histogram_cls( - name="vllm:time_per_output_token_seconds", - documentation="Histogram of time per output token in seconds.", - buckets=[ - 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 - ], - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_time_to_first_token = self._histogram_cls( + name="vllm:time_to_first_token_seconds", + documentation="Histogram of time to first token in seconds.", + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, + 2560.0 + ], + labelnames=labelnames) + self.histogram_time_to_first_token = make_per_engine( + histogram_time_to_first_token, engine_indexes, model_name) + + histogram_time_per_output_token = self._histogram_cls( + name="vllm:time_per_output_token_seconds", + documentation="Histogram of time per output token in seconds.", + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, + 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 + ], + labelnames=labelnames) + self.histogram_time_per_output_token = make_per_engine( + histogram_time_per_output_token, engine_indexes, model_name) request_latency_buckets = [ 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 ] - self.histogram_e2e_time_request = { - idx: - self._histogram_cls( - name="vllm:e2e_request_latency_seconds", - documentation="Histogram of e2e request latency in seconds.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.histogram_queue_time_request = { - idx: - self._histogram_cls( - name="vllm:request_queue_time_seconds", - documentation= - "Histogram of time spent in WAITING phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.histogram_inference_time_request = { - idx: - self._histogram_cls( - name="vllm:request_inference_time_seconds", - documentation= - "Histogram of time spent in RUNNING phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } - - self.histogram_prefill_time_request = { - idx: - self._histogram_cls( - name="vllm:request_prefill_time_seconds", - documentation= - "Histogram of time spent in PREFILL phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, 
str(idx)) - for idx in self.engine_indexes - } - - self.histogram_decode_time_request = { - idx: - self._histogram_cls( - name="vllm:request_decode_time_seconds", - documentation= - "Histogram of time spent in DECODE phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(model_name, str(idx)) - for idx in self.engine_indexes - } + histogram_e2e_time_request = self._histogram_cls( + name="vllm:e2e_request_latency_seconds", + documentation="Histogram of e2e request latency in seconds.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_e2e_time_request = make_per_engine( + histogram_e2e_time_request, engine_indexes, model_name) + + histogram_queue_time_request = self._histogram_cls( + name="vllm:request_queue_time_seconds", + documentation= + "Histogram of time spent in WAITING phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_queue_time_request = make_per_engine( + histogram_queue_time_request, engine_indexes, model_name) + + histogram_inference_time_request = self._histogram_cls( + name="vllm:request_inference_time_seconds", + documentation= + "Histogram of time spent in RUNNING phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_inference_time_request = make_per_engine( + histogram_inference_time_request, engine_indexes, model_name) + + histogram_prefill_time_request = self._histogram_cls( + name="vllm:request_prefill_time_seconds", + documentation= + "Histogram of time spent in PREFILL phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_prefill_time_request = make_per_engine( + histogram_prefill_time_request, engine_indexes, model_name) + + histogram_decode_time_request = self._histogram_cls( + name="vllm:request_decode_time_seconds", + documentation= + "Histogram of time spent in DECODE phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_decode_time_request = make_per_engine( + histogram_decode_time_request, engine_indexes, model_name) # # # # LoRA metrics @@ -603,6 +557,18 @@ def log_engine_initialized(self): self.log_metrics_info("cache_config", self.vllm_config.cache_config) +PromMetric = Union[ + prometheus_client.Gauge, + prometheus_client.Counter, + prometheus_client.Histogram, +] + + +def make_per_engine(metric: PromMetric, engine_idxs: list[int], + model_name: str) -> dict[int, PromMetric]: + return {idx: metric.labels(model_name, str(idx)) for idx in engine_idxs} + + def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]: """ Builds a list of buckets with increasing powers of 10 multiplied by From cad9670547aa812b543541134365b1b9136b72e2 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 15:23:30 +0000 Subject: [PATCH 28/51] updated Signed-off-by: Robert Shaw --- vllm/v1/metrics/loggers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 0288fe2f2197..6bb14d9c4e83 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -13,6 +13,7 @@ from vllm.logger import init_logger from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics from vllm.v1.engine import FinishReason +from vllm.v1.metrics.prometheus import unregister_vllm_metrics from vllm.v1.metrics.stats import IterationStats, SchedulerStats from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm @@ -151,7 +152,7 @@ class 
PrometheusStatLogger(StatLoggerBase): def __init__(self, vllm_config: VllmConfig, engine_num: int = 1): - # unregister_vllm_metrics() + unregister_vllm_metrics() self.vllm_config = vllm_config # Use this flag to hide metrics that were deprecated in # a previous release and which will be removed future From fd0650f258be2dce978d70b970ce033770b1560b Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 15:56:50 +0000 Subject: [PATCH 29/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 22 ++++++++++++++++------ vllm/v1/metrics/loggers.py | 20 ++++++++++++-------- vllm/v1/metrics/ray_wrappers.py | 6 ++++-- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 2758c59f49ee..477a4f8a9847 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -96,10 +96,17 @@ def __init__( self.log_stats = log_stats # Set up stat loggers; independent set for each DP rank. + # HACK: asyncllm should not be aware of how many engines is it + # managing. + start_idx = vllm_config.parallel_config.data_parallel_rank + local_engines = vllm_config.parallel_config.data_parallel_size_local + engine_idxs = [ + idx for idx in range(start_idx, start_idx + local_engines) + ] self.stat_loggers = setup_default_loggers( vllm_config=vllm_config, log_stats=self.log_stats, - engine_num=vllm_config.parallel_config.data_parallel_size, + engine_idxs=engine_idxs, custom_stat_loggers=stat_loggers, ) @@ -130,9 +137,11 @@ def __init__( client_index=client_index, ) if self.stat_loggers: - per_engine_loggers, _ = self.stat_loggers - for stat_logger in per_engine_loggers[0]: - stat_logger.log_engine_initialized() + # loggers, prom_logger + loggers, _ = self.stat_loggers + for per_engine_loggers in loggers.values(): + for logger in per_engine_loggers: + logger.log_engine_initialized() self.output_handler: Optional[asyncio.Task] = None try: # Start output handler eagerly if we are in the asyncio eventloop. 
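The engine_idxs computation in the hunk above maps this AsyncLLM's slice of the data-parallel deployment onto global engine indexes, so its stat loggers (and later the Prometheus engine labels) stay distinct across nodes. A small worked sketch of the arithmetic, with made-up ranks (the attribute names follow the patch; the numbers are only an example):

```python
# Say this is the second node of a deployment running 2 engines per node:
data_parallel_rank = 2        # start rank owned by this AsyncLLM
data_parallel_size_local = 2  # engines this AsyncLLM manages

engine_idxs = [
    idx for idx in range(data_parallel_rank,
                         data_parallel_rank + data_parallel_size_local)
]
assert engine_idxs == [2, 3]  # loggers keyed by 2 and 3, not 0 and 1
```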
@@ -435,7 +444,8 @@ async def abort(self, request_id: str) -> None: @staticmethod def _record_stats( - stat_loggers: tuple[list[list[StatLoggerBase]], PrometheusStatLogger], + stat_loggers: tuple[dict[int, list[StatLoggerBase]], + PrometheusStatLogger], engine_idx: int, scheduler_stats: Optional[SchedulerStats], iteration_stats: Optional[IterationStats], @@ -559,7 +569,7 @@ async def do_log_stats( if self.stat_loggers is None: return per_engine_loggers, _ = self.stat_loggers - for loggers in per_engine_loggers: + for loggers in per_engine_loggers.values(): for stat_logger in loggers: stat_logger.log() diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 6bb14d9c4e83..7a89d05fcbd3 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -150,7 +150,11 @@ class PrometheusStatLogger(StatLoggerBase): _histogram_cls = prometheus_client.Histogram _spec_decoding_cls = SpecDecodingProm - def __init__(self, vllm_config: VllmConfig, engine_num: int = 1): + def __init__(self, + vllm_config: VllmConfig, + engine_indexes: Optional[list[int]] = None): + if engine_indexes is None: + engine_indexes = [0] unregister_vllm_metrics() self.vllm_config = vllm_config @@ -162,7 +166,6 @@ def __init__(self, vllm_config: VllmConfig, engine_num: int = 1): labelnames = ["model_name", "engine"] model_name = vllm_config.model_config.served_model_name max_model_len = vllm_config.model_config.max_model_len - engine_indexes = list(range(engine_num)) # self.spec_decoding_prom = self._spec_decoding_cls( # vllm_config.speculative_config, labelnames, labelvalues) @@ -600,9 +603,9 @@ def build_1_2_5_buckets(max_value: int) -> list[int]: def setup_default_loggers( vllm_config: VllmConfig, log_stats: bool, - engine_num: int, + engine_idxs: list[int], custom_stat_loggers: Optional[list[StatLoggerFactory]] = None, -) -> Optional[tuple[list[list[StatLoggerBase]], PrometheusStatLogger]]: +) -> Optional[tuple[dict[int, list[StatLoggerBase]], PrometheusStatLogger]]: """Setup logging and prometheus metrics.""" if not log_stats: return None @@ -615,13 +618,14 @@ def setup_default_loggers( if logger.isEnabledFor(logging.INFO): factories.append(LoggingStatLogger) - stat_loggers: list[list[StatLoggerBase]] = [] - for engine_idx in range(engine_num): + # engine_idx: Logger + stat_loggers: dict[int, list[StatLoggerBase]] = {} + for engine_idx in engine_idxs: per_engine_stat_loggers: list[StatLoggerBase] = [] for logger_factory in factories: per_engine_stat_loggers.append( logger_factory(vllm_config, engine_idx)) - stat_loggers.append(per_engine_stat_loggers) + stat_loggers[engine_idx] = per_engine_stat_loggers - prom_stat_logger = PrometheusStatLogger(vllm_config, engine_num) + prom_stat_logger = PrometheusStatLogger(vllm_config, engine_idxs) return stat_loggers, prom_stat_logger diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index cce692d6c09e..73344937d89b 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -122,8 +122,10 @@ class RayPrometheusStatLogger(PrometheusStatLogger): _histogram_cls = RayHistogramWrapper _spec_decoding_cls = RaySpecDecodingProm - def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): - super().__init__(vllm_config, engine_index) + def __init__(self, + vllm_config: VllmConfig, + engine_indexes: Optional[list[int]] = None): + super().__init__(vllm_config, engine_indexes) @staticmethod def _unregister_vllm_metrics(): From 896b0a271e0bb6cbc24ef22d31e4e11fd0fb5950 Mon Sep 17 00:00:00 2001 From: Robert 
Shaw Date: Sun, 20 Jul 2025 15:56:59 +0000 Subject: [PATCH 30/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 477a4f8a9847..8fcee8f92a57 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -568,6 +568,7 @@ async def do_log_stats( ) -> None: if self.stat_loggers is None: return + # loggers, prom_logger per_engine_loggers, _ = self.stat_loggers for loggers in per_engine_loggers.values(): for stat_logger in loggers: From 54e405bd925040b167637b20ae144df46c8b6b0f Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 16:45:46 +0000 Subject: [PATCH 31/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 46 +++++------------- vllm/v1/metrics/loggers.py | 94 ++++++++++++++++++++++++++++--------- 2 files changed, 82 insertions(+), 58 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 8fcee8f92a57..7c3186e1a699 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -36,10 +36,9 @@ from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase, - StatLoggerFactory, setup_default_loggers) +from vllm.v1.metrics.loggers import StatLoggerFactory, setup_default_loggers from vllm.v1.metrics.prometheus import shutdown_prometheus -from vllm.v1.metrics.stats import IterationStats, SchedulerStats +from vllm.v1.metrics.stats import IterationStats logger = init_logger(__name__) @@ -103,7 +102,7 @@ def __init__( engine_idxs = [ idx for idx in range(start_idx, start_idx + local_engines) ] - self.stat_loggers = setup_default_loggers( + self.logger_manager = setup_default_loggers( vllm_config=vllm_config, log_stats=self.log_stats, engine_idxs=engine_idxs, @@ -136,12 +135,8 @@ def __init__( client_addresses=client_addresses, client_index=client_index, ) - if self.stat_loggers: - # loggers, prom_logger - loggers, _ = self.stat_loggers - for per_engine_loggers in loggers.values(): - for logger in per_engine_loggers: - logger.log_engine_initialized() + if self.logger_manager: + self.logger_manager.log_engine_initialized() self.output_handler: Optional[asyncio.Task] = None try: # Start output handler eagerly if we are in the asyncio eventloop. @@ -380,7 +375,7 @@ def _run_output_handler(self): engine_core = self.engine_core output_processor = self.output_processor log_stats = self.log_stats - stat_loggers = self.stat_loggers if log_stats else None + logger_manager = self.logger_manager async def output_handler(): try: @@ -420,13 +415,14 @@ async def output_handler(): # 4) Logging. # TODO(rob): make into a coroutine and launch it in # background thread once Prometheus overhead is non-trivial. 
- if stat_loggers: - AsyncLLM._record_stats( - stat_loggers, - outputs.engine_index, + # NOTE: we do not use self.log + if logger_manager: + logger_manager.record( scheduler_stats=outputs.scheduler_stats, iteration_stats=iteration_stats, + engine_idx=outputs.engine_index, ) + except Exception as e: logger.exception("AsyncLLM output_handler failed.") output_processor.propagate_error(e) @@ -442,26 +438,6 @@ async def abort(self, request_id: str) -> None: if self.log_requests: logger.info("Aborted request %s.", request_id) - @staticmethod - def _record_stats( - stat_loggers: tuple[dict[int, list[StatLoggerBase]], - PrometheusStatLogger], - engine_idx: int, - scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats], - ): - """static so that it can be used from the output_handler task - without a circular ref to AsyncLLM.""" - - per_engine_loggers, prom_logger = stat_loggers - for stat_logger in per_engine_loggers[engine_idx]: - stat_logger.record(engine_idx=engine_idx, - scheduler_stats=scheduler_stats, - iteration_stats=iteration_stats) - prom_logger.record(engine_idx=engine_idx, - scheduler_stats=scheduler_stats, - iteration_stats=iteration_stats) - async def encode( self, prompt: PromptType, diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 7a89d05fcbd3..0c44127055a0 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -605,27 +605,75 @@ def setup_default_loggers( log_stats: bool, engine_idxs: list[int], custom_stat_loggers: Optional[list[StatLoggerFactory]] = None, -) -> Optional[tuple[dict[int, list[StatLoggerBase]], PrometheusStatLogger]]: +) -> Optional["StatLoggerManager"]: """Setup logging and prometheus metrics.""" - if not log_stats: - return None - - factories: list[StatLoggerFactory] - if custom_stat_loggers is not None: - factories = custom_stat_loggers - else: - factories = [] - if logger.isEnabledFor(logging.INFO): - factories.append(LoggingStatLogger) - - # engine_idx: Logger - stat_loggers: dict[int, list[StatLoggerBase]] = {} - for engine_idx in engine_idxs: - per_engine_stat_loggers: list[StatLoggerBase] = [] - for logger_factory in factories: - per_engine_stat_loggers.append( - logger_factory(vllm_config, engine_idx)) - stat_loggers[engine_idx] = per_engine_stat_loggers - - prom_stat_logger = PrometheusStatLogger(vllm_config, engine_idxs) - return stat_loggers, prom_stat_logger + return (None if not log_stats else StatLoggerManager( + vllm_config, engine_idxs, custom_stat_loggers)) + + +class StatLoggerManager: + """ + StatLoggerManager: + Logging happens at the level of the EngineCore (per scheduler). + * DP: >1 EngineCore per AsyncLLM - loggers for each EngineCore. + * With Local Logger, just make N copies for N EngineCores. + * With Prometheus, we need a single logger with N "labels" + + This class abstracts away this implementation detail from + the AsyncLLM, allowing the AsyncLLM to just call .record() + and .log() to a simple interface. 
+ """ + + def __init__( + self, + vllm_config: VllmConfig, + engine_idxs: Optional[list[int]] = None, + custom_stat_loggers: Optional[list[StatLoggerFactory]] = None, + ): + self.engine_idxs = set([0]) if not engine_idxs else set(engine_idxs) + + factories: list[StatLoggerFactory] + if custom_stat_loggers is not None: + factories = custom_stat_loggers + else: + factories = [] + if logger.isEnabledFor(logging.INFO): + factories.append(LoggingStatLogger) + + # engine_idx: StatLogger + self.per_engine_logger_dict: dict[int, list[StatLoggerBase]] = {} + for engine_idx in self.engine_idxs: + loggers: list[StatLoggerBase] = [] + for logger_factory in factories: + loggers.append(logger_factory(vllm_config, engine_idx)) + self.per_engine_logger_dict[engine_idx] = loggers + + # For Prometheus, need to share the metrics between EngineCores. + # Each EngineCore's metrics are expressed as a unique label. + self.prometheus_logger = PrometheusStatLogger(vllm_config, engine_idxs) + + def record( + self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: Optional[int] = None, + ): + if engine_idx is None: + engine_idx = 0 + + per_engine_loggers = self.per_engine_logger_dict[engine_idx] + for logger in per_engine_loggers: + logger.record(scheduler_stats, iteration_stats, engine_idx) + + self.prometheus_logger.record(scheduler_stats, iteration_stats, + engine_idx) + + def log(self): + for per_engine_loggers in self.per_engine_logger_dict.values(): + for logger in per_engine_loggers: + logger.log() + + def log_engine_initialized(self): + for per_engine_loggers in self.per_engine_logger_dict.values(): + for logger in per_engine_loggers: + logger.log_engine_initialized() From 02ecfa80a499b930d4518e9ff405178575074b5c Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 16:46:35 +0000 Subject: [PATCH 32/51] updated Signed-off-by: Robert Shaw --- vllm/v1/metrics/loggers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 0c44127055a0..6de65280364d 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -630,7 +630,7 @@ def __init__( engine_idxs: Optional[list[int]] = None, custom_stat_loggers: Optional[list[StatLoggerFactory]] = None, ): - self.engine_idxs = set([0]) if not engine_idxs else set(engine_idxs) + self.engine_idxs = engine_idxs if engine_idxs else [0] factories: list[StatLoggerFactory] if custom_stat_loggers is not None: From 1358836fa0b84000bf57f33da896086d051254a4 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 16:48:31 +0000 Subject: [PATCH 33/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7c3186e1a699..d6ffbba32342 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -422,7 +422,6 @@ async def output_handler(): iteration_stats=iteration_stats, engine_idx=outputs.engine_index, ) - except Exception as e: logger.exception("AsyncLLM output_handler failed.") output_processor.propagate_error(e) From 4eae5cbeeabbb02a078ec46ab62f524eab89aa63 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 16:50:36 +0000 Subject: [PATCH 34/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 7 +++---- vllm/v1/metrics/loggers.py | 11 ----------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/vllm/v1/engine/async_llm.py 
b/vllm/v1/engine/async_llm.py index d6ffbba32342..b8a40f77df31 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -36,7 +36,7 @@ from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import StatLoggerFactory, setup_default_loggers +from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager from vllm.v1.metrics.prometheus import shutdown_prometheus from vllm.v1.metrics.stats import IterationStats @@ -102,12 +102,11 @@ def __init__( engine_idxs = [ idx for idx in range(start_idx, start_idx + local_engines) ] - self.logger_manager = setup_default_loggers( + self.logger_manager = StatLoggerManager( vllm_config=vllm_config, - log_stats=self.log_stats, engine_idxs=engine_idxs, custom_stat_loggers=stat_loggers, - ) + ) if self.log_stats else None # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 6de65280364d..b6b8b6d0a4d8 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -600,17 +600,6 @@ def build_1_2_5_buckets(max_value: int) -> list[int]: return build_buckets([1, 2, 5], max_value) -def setup_default_loggers( - vllm_config: VllmConfig, - log_stats: bool, - engine_idxs: list[int], - custom_stat_loggers: Optional[list[StatLoggerFactory]] = None, -) -> Optional["StatLoggerManager"]: - """Setup logging and prometheus metrics.""" - return (None if not log_stats else StatLoggerManager( - vllm_config, engine_idxs, custom_stat_loggers)) - - class StatLoggerManager: """ StatLoggerManager: From c08fb6d4561f4754d0f5bc6abd9ddb8d5265f017 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 16:57:45 +0000 Subject: [PATCH 35/51] updated Signed-off-by: Robert Shaw --- vllm/v1/metrics/loggers.py | 43 ++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index b6b8b6d0a4d8..d8d9eb463f4e 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -155,6 +155,7 @@ def __init__(self, engine_indexes: Optional[list[int]] = None): if engine_indexes is None: engine_indexes = [0] + self.engine_indexes = engine_indexes unregister_vllm_metrics() self.vllm_config = vllm_config @@ -449,26 +450,28 @@ def __init__(self, # ) def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): - pass - # metrics_info = config_obj.metrics_info() - # metrics_info["engine"] = self.engine_index - - # name, documentation = None, None - # if type == "cache_config": - # name = "vllm:cache_config_info" - # documentation = "Information of the LLMEngine CacheConfig" - # assert name is not None, f"Unknown metrics info type {type}" - - # # Info type metrics are syntactic sugar for a gauge permanently set to 1 # noqa: E501 - # # Since prometheus multiprocessing mode does not support Info, emulate - # # info here with a gauge. 
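The comment reinstated in this hunk carries the key idea: prometheus multiprocess mode has no Info metric type, so an Info is emulated as a Gauge permanently set to 1, with the payload carried entirely in label values (and, after this patch, one child per engine index). A standalone sketch of that trick (assuming prometheus_client; the metric name and config keys are invented for illustration):

```python
import prometheus_client

cache_config_info = {
    "block_size": "16",
    "gpu_memory_utilization": "0.9",
    "engine": "0",
}

# An Info metric is sugar for a gauge pinned to 1; the data lives in labels.
info_gauge = prometheus_client.Gauge(
    name="demo_cache_config_info",
    documentation="Cache config info (illustrative).",
    labelnames=list(cache_config_info.keys()),
)
info_gauge.labels(**cache_config_info).set(1)
```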
- # info_gauge = self._gauge_cls( - # name=name, - # documentation=documentation, - # multiprocess_mode="mostrecent", - # labelnames=metrics_info.keys(), - # ).labels(**metrics_info) - # info_gauge.set(1) + metrics_info = config_obj.metrics_info() + metrics_info["engine"] = "" + + name, documentation = None, None + if type == "cache_config": + name = "vllm:cache_config_info" + documentation = "Information of the LLMEngine CacheConfig" + assert name is not None, f"Unknown metrics info type {type}" + + # Info type metrics are syntactic sugar for a gauge permanently set to 1 # noqa: E501 + # Since prometheus multiprocessing mode does not support Info, emulate + # info here with a gauge. + info_gauge = self._gauge_cls( + name=name, + documentation=documentation, + multiprocess_mode="mostrecent", + labelnames=metrics_info.keys(), + ) + for engine_index in self.engine_indexes: + metrics_info = config_obj.metrics_info() + metrics_info["engine"] = str(engine_index) + info_gauge.labels(*metrics_info.set(1)) def record(self, scheduler_stats: Optional[SchedulerStats], From d9291f998ed46cf6b4af833bbcdd5e14bd2d123d Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 17:04:35 +0000 Subject: [PATCH 36/51] cleanup Signed-off-by: Robert Shaw --- vllm/v1/metrics/loggers.py | 80 +++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index d8d9eb463f4e..6aea6827e9a5 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -425,29 +425,32 @@ def __init__(self, self.histogram_decode_time_request = make_per_engine( histogram_decode_time_request, engine_indexes, model_name) - # # - # # LoRA metrics - # # - - # # TODO: This metric might be incorrect in case of using multiple - # # api_server counts which uses prometheus mp. - # self.gauge_lora_info: Optional[prometheus_client.Gauge] = None - # if vllm_config.lora_config is not None: - # self.labelname_max_lora = "max_lora" - # self.labelname_waiting_lora_adapters = "waiting_lora_adapters" - # self.labelname_running_lora_adapters = "running_lora_adapters" - # self.max_lora = vllm_config.lora_config.max_loras - # self.gauge_lora_info = \ - # self._gauge_cls( - # name="vllm:lora_requests_info", - # documentation="Running stats on lora requests.", - # multiprocess_mode="sum", - # labelnames=[ - # self.labelname_max_lora, - # self.labelname_waiting_lora_adapters, - # self.labelname_running_lora_adapters, - # ], - # ) + # + # LoRA metrics + # + + # TODO: This metric might be incorrect in case of using multiple + # api_server counts which uses prometheus mp. 
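The LoRA gauge re-enabled below encodes its state in label values (comma-joined adapter names) rather than in the sample value, and it has no engine label; the patch raises NotImplementedError for more than one EngineCore instead of trying to disambiguate. A rough sketch of how such a gauge is exercised (assuming prometheus_client; the metric name and adapter names are illustrative):

```python
import prometheus_client

gauge_lora_info = prometheus_client.Gauge(
    name="demo_lora_requests_info",
    documentation="Running stats on lora requests (illustrative).",
    labelnames=[
        "max_lora", "waiting_lora_adapters", "running_lora_adapters"
    ],
)

# State rides in the label values; the sample itself is just a timestamp.
gauge_lora_info.labels(
    max_lora="4",
    waiting_lora_adapters="",
    running_lora_adapters="sql-lora,chat-lora",
).set_to_current_time()
```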
+ self.gauge_lora_info: Optional[prometheus_client.Gauge] = None + if vllm_config.lora_config is not None: + if len(self.engine_indexes) > 1: + raise NotImplementedError( + "LoRA in DP mode is not supported yet.") + self.labelname_max_lora = "max_lora" + self.labelname_waiting_lora_adapters = "waiting_lora_adapters" + self.labelname_running_lora_adapters = "running_lora_adapters" + self.max_lora = vllm_config.lora_config.max_loras + self.gauge_lora_info = \ + self._gauge_cls( + name="vllm:lora_requests_info", + documentation="Running stats on lora requests.", + multiprocess_mode="sum", + labelnames=[ + self.labelname_max_lora, + self.labelname_waiting_lora_adapters, + self.labelname_running_lora_adapters, + ], + ) def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): metrics_info = config_obj.metrics_info() @@ -471,7 +474,7 @@ def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): for engine_index in self.engine_indexes: metrics_info = config_obj.metrics_info() metrics_info["engine"] = str(engine_index) - info_gauge.labels(*metrics_info.set(1)) + info_gauge.labels(**metrics_info).set(1) def record(self, scheduler_stats: Optional[SchedulerStats], @@ -547,18 +550,21 @@ def record(self, self.histogram_max_tokens_request[engine_idx].observe( finished_request.max_tokens_param) - # if self.gauge_lora_info is not None: - # running_lora_adapters = \ - # ",".join(iteration_stats.running_lora_adapters.keys()) - # waiting_lora_adapters = \ - # ",".join(iteration_stats.waiting_lora_adapters.keys()) - # lora_info_labels = { - # self.labelname_running_lora_adapters: running_lora_adapters, - # self.labelname_waiting_lora_adapters: waiting_lora_adapters, - # self.labelname_max_lora: self.max_lora, - # } - # self.gauge_lora_info.labels(**lora_info_labels)\ - # .set_to_current_time() + # TODO: investigate whether LoRA works with DP and ensure + # that the metrics we are logging are consistent with + + if self.gauge_lora_info is not None: + running_lora_adapters = \ + ",".join(iteration_stats.running_lora_adapters.keys()) + waiting_lora_adapters = \ + ",".join(iteration_stats.waiting_lora_adapters.keys()) + lora_info_labels = { + self.labelname_running_lora_adapters: running_lora_adapters, + self.labelname_waiting_lora_adapters: waiting_lora_adapters, + self.labelname_max_lora: self.max_lora, + } + self.gauge_lora_info.labels(**lora_info_labels)\ + .set_to_current_time() def log_engine_initialized(self): self.log_metrics_info("cache_config", self.vllm_config.cache_config) @@ -666,6 +672,8 @@ def log(self): logger.log() def log_engine_initialized(self): + self.prometheus_logger.log_engine_initialized() + for per_engine_loggers in self.per_engine_logger_dict.values(): for logger in per_engine_loggers: logger.log_engine_initialized() From 876c864d3cbe12b181fb1d1e4710edebe595ad6a Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 17:12:27 +0000 Subject: [PATCH 37/51] updated Signed-off-by: Robert Shaw --- vllm/v1/metrics/loggers.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 6aea6827e9a5..2a76e9234f5a 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -168,8 +168,17 @@ def __init__(self, model_name = vllm_config.model_config.served_model_name max_model_len = vllm_config.model_config.max_model_len - # self.spec_decoding_prom = self._spec_decoding_cls( - # vllm_config.speculative_config, labelnames, labelvalues) + if len(self.engine_indexes) > 
1: + raise NotImplementedError("Prometheus metrics with Spec Decoding " + "with >1 EngineCore per AsyncLLM is not " + "supported yet.") + spec_decode_labelvalues = [ + vllm_config.model_config.served_model_name, + str(self.engine_indexes[0]) + ] + self.spec_decoding_prom = self._spec_decoding_cls( + vllm_config.speculative_config, labelnames, + spec_decode_labelvalues) # # Scheduler state From f477b504934eb04b3efac0af4660c1519ec41af3 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 17:15:20 +0000 Subject: [PATCH 38/51] updated Signed-off-by: Robert Shaw --- vllm/v1/metrics/ray_wrappers.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index 73344937d89b..dd21ce85da71 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -3,7 +3,6 @@ import time from typing import Optional, Union -from vllm.config import VllmConfig from vllm.v1.metrics.loggers import PrometheusStatLogger from vllm.v1.spec_decode.metrics import SpecDecodingProm @@ -122,11 +121,6 @@ class RayPrometheusStatLogger(PrometheusStatLogger): _histogram_cls = RayHistogramWrapper _spec_decoding_cls = RaySpecDecodingProm - def __init__(self, - vllm_config: VllmConfig, - engine_indexes: Optional[list[int]] = None): - super().__init__(vllm_config, engine_indexes) - @staticmethod def _unregister_vllm_metrics(): # No-op on purpose From 5ea4fa206d8e08d184174ac3d465975a5cb01489 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 17:17:15 +0000 Subject: [PATCH 39/51] updated Signed-off-by: Robert Shaw --- vllm/v1/metrics/loggers.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 2a76e9234f5a..d4de767855d3 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -168,7 +168,8 @@ def __init__(self, model_name = vllm_config.model_config.served_model_name max_model_len = vllm_config.model_config.max_model_len - if len(self.engine_indexes) > 1: + if (len(self.engine_indexes) > 1 + and vllm_config.speculative_config is not None): raise NotImplementedError("Prometheus metrics with Spec Decoding " "with >1 EngineCore per AsyncLLM is not " "supported yet.") @@ -511,9 +512,9 @@ def record(self, self.counter_prefix_cache_hits[engine_idx].inc( scheduler_stats.prefix_cache_stats.hits) - # if scheduler_stats.spec_decoding_stats is not None: - # self.spec_decoding_prom.observe( - # scheduler_stats.spec_decoding_stats) + if scheduler_stats.spec_decoding_stats is not None: + self.spec_decoding_prom.observe( + scheduler_stats.spec_decoding_stats) if iteration_stats is None: return From e9e180da5fdebeb00d56ac6e2da33331808571d1 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 17:19:13 +0000 Subject: [PATCH 40/51] cleanup Signed-off-by: Robert Shaw --- vllm/v1/metrics/loggers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index d4de767855d3..0fbbb819291e 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -472,7 +472,7 @@ def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): documentation = "Information of the LLMEngine CacheConfig" assert name is not None, f"Unknown metrics info type {type}" - # Info type metrics are syntactic sugar for a gauge permanently set to 1 # noqa: E501 + # Info type metrics are syntactic sugar for a gauge permanently set to 1 # Since prometheus multiprocessing 
mode does not support Info, emulate # info here with a gauge. info_gauge = self._gauge_cls( From 3f4ae353c244b2b3eef0574804b0f77351a3f8ea Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 17:19:42 +0000 Subject: [PATCH 41/51] updated Signed-off-by: Robert Shaw --- vllm/v1/metrics/loggers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 0fbbb819291e..1e13513674ad 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -560,9 +560,6 @@ def record(self, self.histogram_max_tokens_request[engine_idx].observe( finished_request.max_tokens_param) - # TODO: investigate whether LoRA works with DP and ensure - # that the metrics we are logging are consistent with - if self.gauge_lora_info is not None: running_lora_adapters = \ ",".join(iteration_stats.running_lora_adapters.keys()) From 7b53f0e7db273aa8d2baf12e5a93c9041e91defd Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 17:22:15 +0000 Subject: [PATCH 42/51] revert arg utils change Signed-off-by: Robert Shaw --- vllm/engine/arg_utils.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c3262d6e277e..019ff033eda2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1091,15 +1091,13 @@ def create_engine_config( # but we should not do this here. placement_group = ray.util.get_current_placement_group() - # data_parallel_external_lb = self.data_parallel_rank is not None - # if data_parallel_external_lb: - # assert self.data_parallel_size_local in (1, None), ( - # "data_parallel_size_local must be 1 when data_parallel_rank " - # "is set") - # data_parallel_size_local = 1 - # elif self.data_parallel_size_local is not None: - data_parallel_external_lb = False - if self.data_parallel_size_local is not None: + data_parallel_external_lb = self.data_parallel_rank is not None + if data_parallel_external_lb: + assert self.data_parallel_size_local in (1, None), ( + "data_parallel_size_local must be 1 when data_parallel_rank " + "is set") + data_parallel_size_local = 1 + elif self.data_parallel_size_local is not None: data_parallel_size_local = self.data_parallel_size_local else: # Local DP size defaults to global DP size if not set. 
@@ -1360,10 +1358,10 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: and not envs.is_set("VLLM_ATTENTION_BACKEND") ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1" supported = False - if current_platform.is_rocm() or ( - current_platform.is_cuda() - and current_platform.is_device_capability(100) - ): # handle hpu also for OOT platform + if (current_platform.is_rocm() + or (current_platform.is_cuda() + and current_platform.is_device_capability(100)) + or current_platform.is_tpu()): supported = True elif fp8_attention and will_use_fa: from vllm.attention.utils.fa_utils import ( From 753061b0c0a9187bdff02b875ac9a85d5276cc49 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 17:24:09 +0000 Subject: [PATCH 43/51] reset Signed-off-by: Robert Shaw --- vllm/v1/engine/core_client.py | 11 +++-------- vllm/v1/engine/utils.py | 17 ++++------------- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 13d0ee19d16b..82fc1fa9937c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -429,12 +429,11 @@ def __init__( parallel_config = vllm_config.parallel_config dp_size = parallel_config.data_parallel_size dp_rank = parallel_config.data_parallel_rank - dp_local_size = parallel_config.data_parallel_size_local external_dp_lb = parallel_config.data_parallel_external_lb offline_mode = parallel_config.data_parallel_rank_local is not None - engine_ranks = ([dp_rank] if (offline_mode or external_dp_lb) - else range(dp_rank, dp_rank + dp_local_size)) + engine_ranks = [dp_rank] if (offline_mode + or external_dp_lb) else range(dp_size) assert parallel_config.data_parallel_size_local <= len( engine_ranks) @@ -895,8 +894,6 @@ def _ensure_stats_update_task(self): return assert self.stats_update_address is not None - dp_start_rank = self.vllm_config.parallel_config.data_parallel_rank - dp_end_rank = dp_start_rank + self.vllm_config.parallel_config.data_parallel_size_local async def run_engine_stats_update_task(): with make_zmq_socket(self.ctx, self.stats_update_address, @@ -961,9 +958,7 @@ async def run_engine_stats_update_task(): counts, wave, running = msgspec.msgpack.decode(buf) self.current_wave = wave self.engines_running = running - # NOTE: counts includes num running for all global - # EngineCores, so need to slide for the local ones. - self.lb_engines = counts[dp_start_rank:dp_end_rank] + self.lb_engines = counts resources.stats_update_task = asyncio.create_task( run_engine_stats_update_task()) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index ac3acafbe01d..6dde477576b8 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -555,8 +555,6 @@ def launch_core_engines( # sends requests only to colocated engines. client_local_only = offline_mode or external_dp_lb or (local_engine_count == dp_size) - # HACK: handle case with one pod per node. - client_local_only = True # Set up input and output addresses. addresses = EngineZmqAddresses( @@ -603,17 +601,11 @@ def launch_core_engines( if offline_mode or (external_dp_lb and dp_rank > 0): assert local_engine_count == 1 engines_to_handshake = [CoreEngine(index=dp_rank, local=True)] - elif dp_rank == 0: + else: engines_to_handshake = [ CoreEngine(index=i, local=(i < local_engine_count)) for i in range(dp_size) ] - else: - # Just handshake with local engines. 
- engines_to_handshake = [ - CoreEngine(index=i, local=True) for i in - range(dp_rank, dp_rank + local_engine_count) - ] # Whether the started engines will handshake only with co-located # front-end processes. In external_dp_lb mode, ranks > 0 handshake with @@ -624,8 +616,7 @@ def launch_core_engines( handshake_address = get_engine_client_zmq_addr( handshake_local_only, host, parallel_config.data_parallel_rpc_port) - # if external_dp_lb and dp_rank > 0: - if dp_rank > 0: + if external_dp_lb and dp_rank > 0: assert not handshake_local_only local_handshake_address = get_open_zmq_ipc_path() client_handshake_address = local_handshake_address @@ -640,8 +631,8 @@ def launch_core_engines( # Start local engines. if local_engine_count: - # In server mode, start_index is the dp_rank and - # local_start_index is 0. + # In server mode, start_index and local_start_index will + # both be 0. local_engine_manager = CoreEngineProcManager( EngineCoreProc.run_engine_core, vllm_config=vllm_config, From 381d7a6ff22900e05b01bcf9c5c257b7a2b36344 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 17:32:12 +0000 Subject: [PATCH 44/51] add comment Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index b8a40f77df31..1da4a67559d5 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -650,18 +650,16 @@ async def scale_elastic_ep(self, new_data_parallel_size # recreate stat loggers - # if new_data_parallel_size > old_data_parallel_size: - # stat_loggers: list[list[StatLoggerBase]] = setup_default_loggers( - # vllm_config=self.vllm_config, - # log_stats=self.log_stats, - # engine_num=new_data_parallel_size, - # custom_stat_loggers=None, - # ) - # num_new_engines = len(stat_loggers) - len(self.stat_loggers) - # self.stat_loggers.extend(stat_loggers[-num_new_engines:]) - # else: - # for _ in range(old_data_parallel_size - new_data_parallel_size): - # self.stat_loggers.pop() + if new_data_parallel_size > old_data_parallel_size and self.log_stats: + # TODO(rob): fix this after talking with Ray team. + # This resets all the prometheus metrics since we + # unregister during initialization. Need to understand + # the intended behavior here better. + self.logger_manager = StatLoggerManager( + vllm_config=self.vllm_config, + engine_idxs=list(range(new_data_parallel_size)), + custom_stat_loggers=None, + ) @property def is_running(self) -> bool: From d2baf53085b24745e926073c5c5786b86a25f13c Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 17:46:26 +0000 Subject: [PATCH 45/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 26 ++++++++++---------------- vllm/v1/engine/core_client.py | 9 +++++---- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 1da4a67559d5..4ddf9139cbc4 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -94,20 +94,6 @@ def __init__( self.log_requests = log_requests self.log_stats = log_stats - # Set up stat loggers; independent set for each DP rank. - # HACK: asyncllm should not be aware of how many engines is it - # managing. 
- start_idx = vllm_config.parallel_config.data_parallel_rank - local_engines = vllm_config.parallel_config.data_parallel_size_local - engine_idxs = [ - idx for idx in range(start_idx, start_idx + local_engines) - ] - self.logger_manager = StatLoggerManager( - vllm_config=vllm_config, - engine_idxs=engine_idxs, - custom_stat_loggers=stat_loggers, - ) if self.log_stats else None - # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -126,7 +112,6 @@ def __init__( log_stats=self.log_stats) # EngineCore (starts the engine in background process). - self.engine_core = EngineCoreClient.make_async_mp_client( vllm_config=vllm_config, executor_class=executor_class, @@ -134,8 +119,17 @@ def __init__( client_addresses=client_addresses, client_index=client_index, ) - if self.logger_manager: + + # Loggers. + self.logger_manager: Optional[StatLoggerManager] = None + if self.log_stats: + self.logger_manager = StatLoggerManager( + vllm_config=vllm_config, + engine_idxs=self.engine_core.engine_ranks, + custom_stat_loggers=stat_loggers, + ) self.logger_manager.log_engine_initialized() + self.output_handler: Optional[asyncio.Task] = None try: # Start output handler eagerly if we are in the asyncio eventloop. diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 82fc1fa9937c..2ebb76a97ebe 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -432,14 +432,15 @@ def __init__( external_dp_lb = parallel_config.data_parallel_external_lb offline_mode = parallel_config.data_parallel_rank_local is not None - engine_ranks = [dp_rank] if (offline_mode - or external_dp_lb) else range(dp_size) + self.engine_ranks = ([dp_rank] if + (offline_mode or external_dp_lb) else list( + range(dp_size))) assert parallel_config.data_parallel_size_local <= len( - engine_ranks) + self.engine_ranks) # ZMQ identity of each engine that this client will talk to. self.core_engines: list[EngineIdentity] = [ - index.to_bytes(2, "little") for index in engine_ranks + index.to_bytes(2, "little") for index in self.engine_ranks ] # Wait for ready messages from each engine on the input socket. 
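
[Editorial sketch, not part of the patch series] PATCH 45 turns the set of engine ranks owned by the client into an attribute (self.engine_ranks) so that AsyncLLM can reuse it as the engine_idxs for its StatLoggerManager. The following is a minimal illustrative restatement of that selection logic; the helper name select_engine_ranks and the example values are hypothetical and do not appear in the diff.

def select_engine_ranks(dp_rank: int, dp_size: int,
                        offline_mode: bool, external_dp_lb: bool) -> list[int]:
    # Offline mode or an external DP load balancer: the client only talks
    # to its own colocated engine; otherwise it addresses every DP engine.
    if offline_mode or external_dp_lb:
        return [dp_rank]
    return list(range(dp_size))

# Hypothetical usage, mirroring the condition in the hunk above.
assert select_engine_ranks(2, 4, offline_mode=False, external_dp_lb=True) == [2]
assert select_engine_ranks(0, 4, offline_mode=False, external_dp_lb=False) == [0, 1, 2, 3]
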
From ebbc432813ec997e8c7f56acf218766ff33cf54a Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 17:48:07 +0000 Subject: [PATCH 46/51] updated Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 4ddf9139cbc4..fdd89f8cc4c5 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -534,13 +534,8 @@ async def do_log_stats( scheduler_outputs=None, model_output=None, ) -> None: - if self.stat_loggers is None: - return - # loggers, prom_logger - per_engine_loggers, _ = self.stat_loggers - for loggers in per_engine_loggers.values(): - for stat_logger in loggers: - stat_logger.log() + if self.logger_manager: + self.logger_manager.log() async def check_health(self) -> None: logger.debug("Called check_health.") From 4b50833d3c31a58798158087104509866e6acefe Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 17:52:18 +0000 Subject: [PATCH 47/51] cleanup Signed-off-by: Robert Shaw --- vllm/v1/engine/async_llm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fdd89f8cc4c5..b8ba36f3502f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -408,12 +408,11 @@ async def output_handler(): # 4) Logging. # TODO(rob): make into a coroutine and launch it in # background thread once Prometheus overhead is non-trivial. - # NOTE: we do not use self.log if logger_manager: logger_manager.record( + engine_idx=outputs.engine_index, scheduler_stats=outputs.scheduler_stats, iteration_stats=iteration_stats, - engine_idx=outputs.engine_index, ) except Exception as e: logger.exception("AsyncLLM output_handler failed.") From eb5b84e14fd0d9a1e90ba3b55112214a78b57c44 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Sun, 20 Jul 2025 18:47:08 +0000 Subject: [PATCH 48/51] stash Signed-off-by: Robert Shaw --- tests/v1/metrics/test_ray_metrics.py | 1 + vllm/v1/metrics/loggers.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py index 0898ae65e7cd..be9e1417e672 100644 --- a/tests/v1/metrics/test_ray_metrics.py +++ b/tests/v1/metrics/test_ray_metrics.py @@ -41,6 +41,7 @@ async def run(self): model=model, dtype=dtype, disable_log_stats=False, + enforce_eager=True, ) engine = AsyncLLM.from_engine_args( diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 1e13513674ad..e878f51b90ad 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -647,15 +647,24 @@ def __init__( # engine_idx: StatLogger self.per_engine_logger_dict: dict[int, list[StatLoggerBase]] = {} + prometheus_factory = PrometheusStatLogger for engine_idx in self.engine_idxs: loggers: list[StatLoggerBase] = [] for logger_factory in factories: - loggers.append(logger_factory(vllm_config, engine_idx)) + # If we get a custom prometheus logger, use that + # instead. This is typically used for the ray case. + if (isinstance(logger_factory, type) + and issubclass(logger_factory, PrometheusStatLogger)): + prometheus_factory = logger_factory + continue + loggers.append(logger_factory(vllm_config, + engine_idx)) # type: ignore self.per_engine_logger_dict[engine_idx] = loggers # For Prometheus, need to share the metrics between EngineCores. # Each EngineCore's metrics are expressed as a unique label. 
- self.prometheus_logger = PrometheusStatLogger(vllm_config, engine_idxs) + print(f"{prometheus_factory=}") + self.prometheus_logger = prometheus_factory(vllm_config, engine_idxs) def record( self, From c54c17eebf426c2deafa051877a43b6b0d869f36 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 21 Jul 2025 00:53:07 +0000 Subject: [PATCH 49/51] fixing tests Signed-off-by: Robert Shaw --- tests/v1/engine/test_async_llm.py | 7 ++++--- vllm/v1/metrics/loggers.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index e137452f2625..cf334fd34e39 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -336,9 +336,10 @@ async def test_customize_loggers(monkeypatch): await engine.do_log_stats() - assert len(engine.stat_loggers) == 1 - assert len(engine.stat_loggers[0]) == 1 - engine.stat_loggers[0][0].log.assert_called_once() + stat_loggers = engine.logger_manager.per_engine_logger_dict[0] + assert len(stat_loggers) == 1 + assert len(stat_loggers) == 1 + stat_loggers.log.assert_called_once() @pytest.mark.asyncio(scope="module") diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index e878f51b90ad..7f2556bab5a4 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -663,7 +663,6 @@ def __init__( # For Prometheus, need to share the metrics between EngineCores. # Each EngineCore's metrics are expressed as a unique label. - print(f"{prometheus_factory=}") self.prometheus_logger = prometheus_factory(vllm_config, engine_idxs) def record( From 4be985d3b1b528dec2d6ce000cf86a989e1214e6 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 21 Jul 2025 00:55:52 +0000 Subject: [PATCH 50/51] passing Signed-off-by: Robert Shaw --- tests/v1/engine/test_async_llm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index cf334fd34e39..412df3acff12 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -336,10 +336,10 @@ async def test_customize_loggers(monkeypatch): await engine.do_log_stats() - stat_loggers = engine.logger_manager.per_engine_logger_dict[0] + stat_loggers = engine.logger_manager.per_engine_logger_dict assert len(stat_loggers) == 1 - assert len(stat_loggers) == 1 - stat_loggers.log.assert_called_once() + assert len(stat_loggers[0]) == 1 + stat_loggers[0][0].log.assert_called_once() @pytest.mark.asyncio(scope="module") From 20e7f1742be017dd91df7b9b7927b66fc3a45027 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Mon, 21 Jul 2025 00:59:43 +0000 Subject: [PATCH 51/51] get other failing tst to pass Signed-off-by: Robert Shaw --- tests/v1/test_async_llm_dp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index 64a41bec3791..6716d27f571f 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -90,8 +90,10 @@ class SimpleStatsLogger(StatLoggerBase): def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): stats_loggers[engine_index] = self - def record(self, scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats]): + def record(self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: int = 0): if iteration_stats: self.finished_req_count += len( iteration_stats.finished_requests)
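
[Editorial sketch, not part of the patch series] The final patches settle on a stat-logger interface where per-engine loggers are built by StatLoggerManager from a list of factories and record() also receives an engine_idx. The sketch below shows how a custom logger could plug into that interface. It is a reconstruction from the diffs above under stated assumptions: the class name RequestCountLogger is hypothetical, the import paths for StatLoggerBase, SchedulerStats and IterationStats are assumed, and log()/log_engine_initialized() are stubbed the way the tests in this series do.

from typing import Optional

from vllm.config import VllmConfig
from vllm.v1.metrics.loggers import StatLoggerBase
from vllm.v1.metrics.stats import IterationStats, SchedulerStats


class RequestCountLogger(StatLoggerBase):
    """Hypothetical custom logger: counts finished requests per engine."""

    def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
        self.engine_index = engine_index
        self.finished_req_count = 0

    def record(self,
               scheduler_stats: Optional[SchedulerStats],
               iteration_stats: Optional[IterationStats],
               engine_idx: int = 0):
        # New signature per PATCH 51: record() now receives engine_idx.
        if iteration_stats is not None:
            self.finished_req_count += len(iteration_stats.finished_requests)

    def log(self):
        print(f"engine {self.engine_index}: "
              f"{self.finished_req_count} finished requests")

    def log_engine_initialized(self):
        pass

As in the adapted tests, such a class would be handed over as a factory (for example stat_loggers=[RequestCountLogger]); StatLoggerManager then instantiates it once per engine index, while Prometheus metrics stay shared across engines through a single prometheus-style logger with the engine expressed as a label.
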