diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 948ce9e8667f..5422518588bf 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -98,25 +98,6 @@ steps: - pytest -v -s basic_correctness/test_cpu_offload.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py -- label: Chunked Prefill Test - mirror_hardwares: [amdexperimental, amdproduction] - source_file_dependencies: - - vllm/ - - tests/basic_correctness/test_chunked_prefill - commands: - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - -- label: Core Test # 10min - mirror_hardwares: [amdexperimental, amdproduction] - fast_check: true - source_file_dependencies: - - vllm/core - - vllm/distributed - - tests/core - commands: - - pytest -v -s core - - label: Entrypoints Test (LLM) # 40min mirror_hardwares: [amdexperimental] working_dir: "/vllm-workspace/tests" @@ -155,7 +136,6 @@ steps: num_gpus: 4 source_file_dependencies: - vllm/distributed/ - - vllm/core/ - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events @@ -170,7 +150,6 @@ steps: - tests/v1/engine/test_engine_core_client.py commands: # test with tp=2 and external_dp=2 - - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py # test with tp=2 and pp=2 - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py @@ -209,15 +188,13 @@ steps: commands: - pytest -v -s distributed/test_eplb_execute.py -- label: Metrics, Tracing Test # 10min +- label: Tracing Test # 10min mirror_hardwares: [amdexperimental, amdproduction] num_gpus: 2 source_file_dependencies: - vllm/ - - tests/metrics - tests/tracing commands: - - pytest -v -s metrics - "pip install \ 'opentelemetry-sdk>=1.26.0' \ 'opentelemetry-api>=1.26.0' \ @@ -305,15 +282,6 @@ steps: - python3 offline_inference/basic/score.py - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 -- label: Prefix Caching Test # 9min - mirror_hardwares: [amdexperimental, amdproduction] - source_file_dependencies: - - vllm/ - - tests/prefix_caching - commands: - - pytest -v -s prefix_caching - - - label: Platform Tests (CUDA) mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: @@ -322,17 +290,6 @@ steps: commands: - pytest -v -s cuda/test_cuda_context.py -- label: Samplers Test # 36min - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/model_executor/layers - - vllm/sampling_metadata.py - - tests/samplers - - tests/conftest.py - commands: - - pytest -v -s samplers - - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - - label: LoRA Test %N # 15min each mirror_hardwares: [amdexperimental] source_file_dependencies: @@ -493,13 +450,13 @@ steps: commands: # LMEval+Transcription WER check - pytest -s entrypoints/openai/correctness/ -- label: Encoder Decoder tests # 5min - mirror_hardwares: [amdexperimental, amdproduction] - source_file_dependencies: - - vllm/ - - tests/encoder_decoder - commands: - - pytest -v -s encoder_decoder +# - label: Encoder Decoder tests # 5min +# mirror_hardwares: [amdexperimental, amdproduction] +# source_file_dependencies: +# - vllm/ +# - tests/encoder_decoder +# commands: +# - pytest -v -s encoder_decoder - label: OpenAI-Compatible Tool Use # 20 min 
mirror_hardwares: [amdexperimental, amdproduction] @@ -664,7 +621,6 @@ steps: num_nodes: 2 source_file_dependencies: - vllm/distributed/ - - vllm/engine/ - vllm/executor/ - vllm/model_executor/models/ - tests/distributed/ @@ -687,14 +643,10 @@ steps: num_gpus: 2 source_file_dependencies: - vllm/distributed/ - - vllm/engine/ - vllm/executor/ - vllm/model_executor/models/ - tests/distributed/ - vllm/compilation - - vllm/worker/worker_base.py - - vllm/worker/worker.py - - vllm/worker/model_runner.py - entrypoints/llm/test_collective_rpc.py - tests/v1/test_async_llm_dp.py - tests/v1/test_external_lb_dp.py @@ -743,34 +695,12 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins -- label: Multi-step Tests (4 GPUs) # 36min - mirror_hardwares: [amdexperimental, amdproduction] - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/model_executor/layers/sampler.py - - vllm/sequence.py - - vllm/worker/worker_base.py - - vllm/worker/worker.py - - vllm/worker/multi_step_worker.py - - vllm/worker/model_runner_base.py - - vllm/worker/model_runner.py - - vllm/worker/multi_step_model_runner.py - - vllm/engine - - tests/multi_step - commands: - # this test is quite flaky - # TODO: investigate and fix. - # - pytest -v -s multi_step/test_correctness_async_llm.py - - pytest -v -s multi_step/test_correctness_llm.py - - label: Pipeline Parallelism Test # 45min mirror_hardwares: [amdexperimental, amdproduction] working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: - vllm/distributed/ - - vllm/engine/ - vllm/executor/ - vllm/model_executor/models/ - tests/distributed/ diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 244105537166..f384ec19a46f 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -3,12 +3,7 @@ # This lists cover the "core" components of vLLM that require careful review /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth /vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm /vllm/multimodal @DarkLight1337 @ywang96 diff --git a/pyproject.toml b/pyproject.toml index a65267942d47..e8d03b8fcb7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,10 +69,7 @@ line-length = 80 "vllm/_version.py" = ["ALL"] # Python 3.8 typing - skip V0 code "vllm/attention/**/*.py" = ["UP006", "UP035"] -"vllm/core/**/*.py" = ["UP006", "UP035"] "vllm/engine/**/*.py" = ["UP006", "UP035"] -"vllm/executor/**/*.py" = ["UP006", "UP035"] -"vllm/worker/**/*.py" = ["UP006", "UP035"] # Python 3.8 typing - skip utils for ROCm "vllm/utils/__init__.py" = ["UP006", "UP035"] @@ -119,7 +116,6 @@ files = [ "vllm/adapter_commons", "vllm/assets", "vllm/entrypoints", - "vllm/core", "vllm/inputs", "vllm/logging_utils", "vllm/multimodal", diff --git a/tests/async_engine/__init__.py b/tests/async_engine/__init__.py deleted 
file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/async_engine/api_server_async_engine.py b/tests/async_engine/api_server_async_engine.py deleted file mode 100644 index ec6b20f5e04b..000000000000 --- a/tests/async_engine/api_server_async_engine.py +++ /dev/null @@ -1,54 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""vllm.entrypoints.api_server with some extra logging for testing.""" -from collections.abc import Iterable -from typing import Any - -import uvicorn -from fastapi.responses import JSONResponse, Response - -import vllm.entrypoints.api_server -import vllm.envs as envs -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.utils import FlexibleArgumentParser - -app = vllm.entrypoints.api_server.app - - -class AsyncLLMEngineWithStats(AsyncLLMEngine): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._num_aborts = 0 - - async def _engine_abort(self, request_ids: Iterable[str]): - ids = list(request_ids) - self._num_aborts += len(ids) - await super()._engine_abort(ids) - - def testing_stats(self) -> dict[str, Any]: - return {"num_aborted_requests": self._num_aborts} - - -@app.get("/stats") -def stats() -> Response: - """Get the statistics of the engine.""" - return JSONResponse(engine.testing_stats()) - - -if __name__ == "__main__": - parser = FlexibleArgumentParser() - parser.add_argument("--host", type=str, default="localhost") - parser.add_argument("--port", type=int, default=8000) - parser = AsyncEngineArgs.add_cli_args(parser) - args = parser.parse_args() - - engine_args = AsyncEngineArgs.from_cli_args(args) - engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) - vllm.entrypoints.api_server.engine = engine - uvicorn.run(app, - host=args.host, - port=args.port, - log_level="debug", - timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE) diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py deleted file mode 100644 index 375b248ebeda..000000000000 --- a/tests/async_engine/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py deleted file mode 100644 index 76c94bdf80ca..000000000000 --- a/tests/async_engine/test_api_server.py +++ /dev/null @@ -1,113 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -import subprocess -import sys -import time -from multiprocessing import Pool -from pathlib import Path - -import pytest -import requests - - -def _query_server(prompt: str, max_tokens: int = 5) -> dict: - response = requests.post("http://localhost:8000/generate", - json={ - "prompt": prompt, - "max_tokens": max_tokens, - "temperature": 0, - "ignore_eos": True - }) - response.raise_for_status() - return response.json() - - -def _query_server_long(prompt: str) -> dict: - return _query_server(prompt, max_tokens=500) - - -@pytest.fixture -def api_server(distributed_executor_backend: str): - script_path = Path(__file__).parent.joinpath( - "api_server_async_engine.py").absolute() - commands = [ - sys.executable, - "-u", - str(script_path), - "--model", - "facebook/opt-125m", - "--host", - "127.0.0.1", - "--distributed-executor-backend", - distributed_executor_backend, - ] - - # API Server Test Requires V0. - my_env = os.environ.copy() - my_env["VLLM_USE_V1"] = "0" - uvicorn_process = subprocess.Popen(commands, env=my_env) - yield - uvicorn_process.terminate() - - -@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) -def test_api_server(api_server, distributed_executor_backend: str): - """ - Run the API server and test it. - - We run both the server and requests in separate processes. - - We test that the server can handle incoming requests, including - multiple requests at the same time, and that it can handle requests - being cancelled without crashing. 
- """ - with Pool(32) as pool: - # Wait until the server is ready - prompts = ["warm up"] * 1 - result = None - while not result: - try: - for r in pool.map(_query_server, prompts): - result = r - break - except requests.exceptions.ConnectionError: - time.sleep(1) - - # Actual tests start here - # Try with 1 prompt - for result in pool.map(_query_server, prompts): - assert result - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests == 0 - - # Try with 100 prompts - prompts = ["test prompt"] * 100 - for result in pool.map(_query_server, prompts): - assert result - - with Pool(32) as pool: - # Cancel requests - prompts = ["canceled requests"] * 100 - pool.map_async(_query_server_long, prompts) - time.sleep(0.01) - pool.terminate() - pool.join() - - # check cancellation stats - # give it some times to update the stats - time.sleep(1) - - num_aborted_requests = requests.get( - "http://localhost:8000/stats").json()["num_aborted_requests"] - assert num_aborted_requests > 0 - - # check that server still runs after cancellations - with Pool(32) as pool: - # Try with 100 prompts - prompts = ["test prompt after canceled"] * 100 - for result in pool.map(_query_server, prompts): - assert result diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py deleted file mode 100644 index 0eb7a6eb52aa..000000000000 --- a/tests/async_engine/test_async_llm_engine.py +++ /dev/null @@ -1,409 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import os -import uuid -from asyncio import CancelledError -from copy import copy -from dataclasses import dataclass, field -from typing import Any, Optional - -import pytest -import pytest_asyncio -import torch - -from vllm import SamplingParams -from vllm.config import ParallelConfig -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine -from vllm.outputs import RequestOutput as RealRequestOutput -from vllm.sampling_params import RequestOutputKind - -from ..utils import wait_for_gpu_memory_to_clear - - -@dataclass -class RequestOutput: - request_id: int - finished: bool = False - - -@dataclass -class MockModelConfig: - use_async_output_proc = True - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - - -class MockEngine: - - def __init__(self): - self.step_calls = 0 - self.add_request_calls = 0 - self.abort_request_calls = 0 - self.request_id = None - # Ugly, remove dependency when possible - self.parallel_config = ParallelConfig() - self.model_config = MockModelConfig() - - async def step_async(self, virtual_engine): - # PP size is 1, ignore virtual engine - self.step_calls += 1 - return [RequestOutput( - request_id=self.request_id)] if self.request_id else [] - - async def process_model_inputs_async(self, *args, **kwargs): - pass - - async def stop_remote_worker_execution_loop_async(self): - pass - - def generate(self, request_id): - self.request_id = request_id - - def stop_generating(self): - self.request_id = None - - def add_request(self, **kwargs): - del kwargs # Unused - self.add_request_calls += 1 - print(f'Request calls: {self.add_request_calls}') - - async def add_request_async(self, **kwargs): - self.add_request_calls += 1 - return - - def abort_request(self, request_id): - del request_id # Unused - self.abort_request_calls += 1 - - def 
has_unfinished_requests(self): - return self.request_id is not None - - def has_unfinished_requests_for_virtual_engine(self, virtual_engine): - return self.request_id is not None - - -class MockAsyncLLMEngine(AsyncLLMEngine): - _engine_class = MockEngine - - -@pytest.mark.asyncio -async def test_new_requests_event(): - params = SamplingParams() - - engine = MockAsyncLLMEngine() - engine.start_background_loop() - await asyncio.sleep(0.01) - assert engine.engine.step_calls == 0 - - await engine.add_request("1", "", params) - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 1 - assert engine.engine.step_calls == 1 - - await engine.add_request("2", "", params) - engine.engine.generate("2") - await asyncio.sleep(0) - await asyncio.sleep(0) - await asyncio.sleep(0) - assert engine.engine.add_request_calls == 2 - assert engine.engine.step_calls >= 2 - await asyncio.sleep(0.001) - assert engine.engine.step_calls >= 3 - engine.engine.stop_generating() - await asyncio.sleep(0.001) - old_step_calls = engine.engine.step_calls - await asyncio.sleep(0.001) - assert engine.engine.step_calls == old_step_calls - - await engine.add_request("3", "", params) - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == old_step_calls + 1 - await asyncio.sleep(0.01) - assert engine.engine.add_request_calls == 3 - assert engine.engine.step_calls == old_step_calls + 1 - - engine = MockAsyncLLMEngine() - assert engine.get_model_config() is not None - assert engine.get_tokenizer() is not None - assert engine.get_decoding_config() is not None - - -def start_engine(): - wait_for_gpu_memory_to_clear( - devices=list(range(torch.cuda.device_count())), - threshold_bytes=2 * 2**30, - timeout_s=60, - ) - - num_scheduler_steps = int(os.getenv("NUM_SCHEDULER_STEPS", "1")) - print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}") - - return AsyncLLMEngine.from_engine_args( - AsyncEngineArgs(model="facebook/opt-125m", - enforce_eager=True, - num_scheduler_steps=num_scheduler_steps)) - - -def uid() -> str: - return str(uuid.uuid4()) - - -@pytest_asyncio.fixture(scope="module") -async def async_engine(): - # We cannot use monkeypatch since this is a module - # scoped fixture and monkeypatch is function scoped. 
- previous_value = os.getenv("VLLM_USE_V1", None) - os.environ["VLLM_USE_V1"] = "0" - engine = await asyncio.get_event_loop().run_in_executor(executor=None, - func=start_engine) - try: - yield engine - finally: - engine.shutdown_background_loop() - del engine - await asyncio.sleep(0.1) - cleanup_dist_env_and_memory() - - if previous_value: - os.environ["VLLM_USE_V1"] = previous_value - else: - del os.environ["VLLM_USE_V1"] - - -@pytest.fixture() -def should_do_global_cleanup_after_test(request) -> bool: - # So we can share the async engine fixture between these tests - return False - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_asyncio_run(async_engine, stop): - - scheduler_config = await async_engine.get_scheduler_config() - num_scheduler_steps = scheduler_config.num_scheduler_steps - - async def run(prompt: str): - sampling_params = SamplingParams( - temperature=0, - max_tokens=32, - min_tokens=32, - stop=stop, - ) - - output_count = 0 - final_output = None - async for output in async_engine.generate(prompt, - sampling_params, - request_id=uid()): - output_count += 1 - final_output = output - return final_output, output_count - - results = await asyncio.gather( - run("test0"), - run("test0"), - ) - assert len(results) == 2 - first, second = results - - # remove nondeterministic fields for comparison - first[0].metrics = None - second[0].metrics = None - first[0].request_id = None - second[0].request_id = None - - assert str(first) == str(second) - - output_count = results[0][1] - if num_scheduler_steps == 1: - assert output_count == 32 - else: - assert 1 < output_count < 32 - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_output_kinds(async_engine, stop): - """Test that output_kind works as expected and that - results are equivalent across different kinds.""" - - scheduler_config = await async_engine.get_scheduler_config() - num_scheduler_steps = scheduler_config.num_scheduler_steps - - sampling_params = SamplingParams( - temperature=0, - max_tokens=32, - min_tokens=32, - stop=stop, - ) - - async def run(prompt: str, kind: RequestOutputKind): - params = copy(sampling_params) - params.output_kind = kind - - output_count = 0 - final_output = None - async for output in async_engine.generate(prompt, - params, - request_id=uid()): - output_count += 1 - final_output = output - - assert final_output is not None - assert final_output.finished - - return (final_output.prompt_token_ids, - final_output.outputs[0].token_ids, - final_output.outputs[0].text, output_count) - - async def run_deltas(prompt: str): - params = copy(sampling_params) - params.output_kind = RequestOutputKind.DELTA - - prompt_tokens = None - output_tokens: list[int] = [] - output_text = "" - output_count = 0 - final_output = None - async for output in async_engine.generate(prompt, - params, - request_id=uid()): - token_ids = output.outputs[0].token_ids - text = output.outputs[0].text - final_output = output - - # Ensure we get prompt ids iff we haven't yet received output tokens - if output_tokens: - assert 1 <= len(token_ids) <= num_scheduler_steps - assert stop or text - assert not output.prompt_token_ids - else: - assert output.prompt_token_ids - prompt_tokens = output.prompt_token_ids - - output_tokens.extend(token_ids) - output_text += text - - output_count += 1 - - assert final_output is not None - assert final_output.finished - - return prompt_tokens, output_tokens, output_text, 
output_count - - results = await asyncio.gather( - run("common input prompt", RequestOutputKind.CUMULATIVE), - run("common input prompt", RequestOutputKind.FINAL_ONLY), - run_deltas("common input prompt")) - - # Make sure outputs are the same - prompt_set = set(tuple(prompt_ids) for prompt_ids, _, _, _ in results) - assert len(prompt_set) == 1 - - text_set = set(text for _, _, text, _ in results) - assert len(text_set) == 1 - - tokens_set = set(tuple(ids) for _, ids, _, _ in results) - assert len(tokens_set) == 1 - - cumulative, final, deltas = results - - # output message counts - assert cumulative[3] == deltas[3] - - if num_scheduler_steps == 1: - assert cumulative[3] == 32 - else: - assert 1 < cumulative[3] < 32 - - assert final[3] == 1 - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_cancellation(async_engine, stop): - scheduler_config = await async_engine.get_scheduler_config() - num_scheduler_steps = scheduler_config.num_scheduler_steps - - sampling_params = SamplingParams( - temperature=0, - min_tokens=13, - max_tokens=13, - stop=stop, - ) - - stop_at = 5 if num_scheduler_steps == 1 else 1 - - request_id = uid() - - i = 0 - with pytest.raises(CancelledError): - async for output in async_engine.generate("test2", - sampling_params, - request_id=request_id): - assert not output.finished - i += 1 - if i == stop_at: - await async_engine.abort(request_id) - - assert i == stop_at - - -@pytest.mark.asyncio(scope="module") -@pytest.mark.parametrize("stop", [None, ["a stop string"]]) -async def test_delayed_generator(async_engine, stop): - scheduler_config = await async_engine.get_scheduler_config() - - if scheduler_config.num_scheduler_steps != 1: - pytest.skip("no need to test this one with multistep") - - sampling_params = SamplingParams( - temperature=0, - min_tokens=10, - max_tokens=10, - stop=stop, - ) - - stream = async_engine.generate("test3", sampling_params, request_id=uid()) - i = 0 - final_output: Optional[RealRequestOutput] = None - async for output in stream: - final_output = output - if i == 0: - # wait for generation to complete before consuming - # the remaining messages - await asyncio.sleep(1) - if i < 9: - assert not output.finished - i += 1 - - assert i == 10 - assert final_output is not None - assert len(final_output.outputs[0].token_ids) == 10 - assert final_output.finished - - -@pytest.mark.asyncio(scope="module") -async def test_invalid_argument(async_engine): - scheduler_config = await async_engine.get_scheduler_config() - - if scheduler_config.num_scheduler_steps != 1: - pytest.skip("no need to test this one with multistep") - - sampling_params = SamplingParams( - temperature=0, - min_tokens=10, - max_tokens=10, - ) - - # Targeting specific DP rank only supported in v1 multi-instance DP - with pytest.raises(ValueError): - async for _ in async_engine.generate("test", - sampling_params, - request_id=uid(), - data_parallel_rank=0): - pass diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py deleted file mode 100644 index 1851eeeda790..000000000000 --- a/tests/async_engine/test_request_tracker.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.async_llm_engine import RequestTracker -from vllm.outputs import RequestOutput - - -@pytest.mark.asyncio -async def test_request_tracker(): - tracker = RequestTracker() - stream_1 = 
tracker.add_request("1") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 1 - assert new[0]["request_id"] == "1" - assert not aborted - assert not stream_1.finished - - stream_2 = tracker.add_request("2") - stream_3 = tracker.add_request("3") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert len(new) == 2 - assert new[0]["request_id"] == "2" - assert new[1]["request_id"] == "3" - assert not aborted - assert not stream_2.finished - assert not stream_3.finished - - # request_ids must be unique - with pytest.raises(KeyError): - tracker.add_request("1") - assert not tracker.new_requests_event.is_set() - - tracker.abort_request("1") - new, aborted = tracker.get_new_and_aborted_requests() - assert len(aborted) == 1 - assert "1" in aborted - assert not new - assert stream_1.finished - - stream_4 = tracker.add_request("4") - tracker.abort_request("4") - assert tracker.new_requests_event.is_set() - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - # aborted new requests will cancel each other out - - # there's no need for them to propagate into the - # engine - assert not aborted - assert not new - assert stream_4.finished - - stream_5 = tracker.add_request("5") - assert tracker.new_requests_event.is_set() - tracker.process_request_output( - RequestOutput("2", "output", [], [], [], finished=True)) - await tracker.wait_for_new_requests() - new, aborted = tracker.get_new_and_aborted_requests() - assert not tracker.new_requests_event.is_set() - assert not aborted - assert len(new) == 1 - assert new[0]["request_id"] == "5" - assert stream_2.finished - assert not stream_5.finished diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py deleted file mode 100644 index 4816b76996fc..000000000000 --- a/tests/basic_correctness/test_chunked_prefill.py +++ /dev/null @@ -1,296 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Compare the outputs of HF and vLLM when using greedy sampling. - -It tests chunked prefill. Chunked prefill can be enabled by -enable_chunked_prefill=True. If prefill size exceeds max_num_batched_tokens, -prefill requests are chunked. - -Run `pytest tests/models/test_chunked_prefill.py`. -""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pytest - -from vllm.platforms import current_platform -from vllm.utils import STR_BACKEND_ENV_VAR - -from ..models.utils import check_logprobs_close, check_outputs_equal -from ..utils import multi_gpu_test - -if TYPE_CHECKING: - from .conftest import HfRunner, VllmRunner - -MODELS = [ - "facebook/opt-125m", - "meta-llama/Llama-3.2-1B-Instruct", -] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch: pytest.MonkeyPatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the file. 
- """ - with monkeypatch.context() as m: - m.setenv('VLLM_USE_V1', '0') - yield - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) -@pytest.mark.parametrize("enforce_eager", [False, True]) -# NOTE: Increasing this in this suite will fail CI because we currently cannot -# reset distributed env properly. Use a value > 1 just when you test. -@pytest.mark.parametrize("tensor_parallel_size", [1]) -@pytest.mark.parametrize("attention_backend", [ - pytest.param("FLASHINFER", - marks=pytest.mark.skipif( - current_platform.is_rocm(), - reason="FLASHINFER isn't supported on ROCm")), - "FLASH_ATTN" -]) -def test_models( - hf_runner: HfRunner, - vllm_runner: VllmRunner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - chunked_prefill_token_size: int, - enforce_eager: bool, - tensor_parallel_size: int, - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """ - Checks exact match decode between huggingface model and vllm runner with - chunked prefill. - """ - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - max_num_seqs = chunked_prefill_token_size - max_num_batched_tokens = chunked_prefill_token_size - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("attention_backend", [ - pytest.param("FLASHINFER", - marks=pytest.mark.skipif( - current_platform.is_rocm(), - reason="FLASHINFER isn't supported on ROCm")), - "FLASH_ATTN" -]) -def test_models_distributed( - hf_runner: HfRunner, - vllm_runner: VllmRunner, - example_prompts, - model: str, - distributed_executor_backend: str, - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - if (model == "meta-llama/Llama-3.2-1B-Instruct" - and distributed_executor_backend == "ray"): - # test Ray Compiled Graph - m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1") - m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1") - - dtype = "half" - max_tokens = 5 - chunked_prefill_token_size = 16 - - # Add a chunked prefill config. - max_num_seqs = min(chunked_prefill_token_size, 256) - assert chunked_prefill_token_size != -1 - enable_chunked_prefill = True - max_num_batched_tokens = chunked_prefill_token_size - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with - # fork method (the default method). 
- - with vllm_runner( - model, - dtype=dtype, - tensor_parallel_size=2, - max_num_seqs=max_num_seqs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy( - example_prompts, - max_tokens, - ) - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize( - "kv_cache_dtype,model", - [("fp8_e4m3", - "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")]) -# Due to low-precision numerical divergence, we only test logprob of 4 tokens -@pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16]) -@pytest.mark.parametrize("enforce_eager", [False, True]) -# NOTE: Increasing this in this suite will fail CI because we currently cannot -# reset distributed env properly. Use a value > 1 just when you test. -@pytest.mark.parametrize("tensor_parallel_size", [1]) -# Due to low-precision numerical divergence, this test is too sensitive to -# the async postprocessor -@pytest.mark.parametrize("disable_async_output_proc", [True]) -@pytest.mark.skipif(current_platform.is_rocm(), - reason="machete_prepack_B isn't supported on ROCm") -def test_models_with_fp8_kv_cache( - vllm_runner: VllmRunner, - example_prompts, - kv_cache_dtype: str, - model: str, - max_tokens: int, - chunked_prefill_token_size: int, - enforce_eager: bool, - tensor_parallel_size: int, - disable_async_output_proc: bool, -) -> None: - """ - Check output logprobs match between no_chunked_prefill and chunked_prefill - with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py, - so here we only check chunked prefill. - """ - NUM_LOG_PROBS = 8 - - max_num_seqs = chunked_prefill_token_size - max_num_batched_tokens = chunked_prefill_token_size - - with vllm_runner( - model, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) - - with vllm_runner( - model, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) - - check_logprobs_close( - outputs_0_lst=no_chunked_prefill_outputs, - outputs_1_lst=chunked_prefill_outputs, - name_0="no_chunked_prefill", - name_1="chunked_prefill", - ) - - -@pytest.mark.parametrize("max_tokens", [16]) -@pytest.mark.parametrize("enforce_eager", [False]) -@pytest.mark.parametrize("chunk_size", [30, 32]) -# NOTE: Increasing this in this suite will fail CI because we currently cannot -# reset distributed env properly. Use a value > 1 just when you test. 
-@pytest.mark.parametrize("tensor_parallel_size", [1]) -@pytest.mark.parametrize("dtype", ["half"]) -def test_with_prefix_caching( - vllm_runner: VllmRunner, - max_tokens: int, - enforce_eager: bool, - chunk_size: int, - tensor_parallel_size: int, - dtype: str, -) -> None: - """ - Checks exact match decode with and without prefix caching - with chunked prefill enabled. - """ - model = "meta-llama/Llama-3.2-1B-Instruct" - # The common prompt has 142 tokens with Llama-2 tokenizer. - common_prompt = "You are a helpful AI assistant " * 20 - unique_prompts = [ - "Question", # Warmup - "Question", # Fully cached - "Another question", # Partial cached - ] - full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts] - - max_num_batched_tokens = max_num_seqs = chunk_size - outputs = {} # type: ignore - for enable in (True, False): - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - enable_prefix_caching=enable, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - ) as vllm_model: - outputs[enable] = [] - for prompt in full_prompts: - outputs[enable] += vllm_model.generate_greedy( - [prompt], - max_tokens, - ) - - check_outputs_equal( - outputs_0_lst=outputs[False], - outputs_1_lst=outputs[True], - name_0="w/o prefix caching", - name_1="with prefix caching", - ) diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py deleted file mode 100644 index db2fa2f6bef6..000000000000 --- a/tests/basic_correctness/test_preemption.py +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Compare the short outputs of HF and vLLM when using greedy sampling. - -VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test. - -Run `VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 -pytest tests/basic_correctness/test_preemption.py`. -""" -import pytest -from prometheus_client import REGISTRY - -import vllm.envs as envs -from vllm import SamplingParams -from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, - ENABLE_ARTIFICIAL_PREEMPT) - -from ..models.utils import check_outputs_equal - -MODELS = [ - "distilbert/distilgpt2", -] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - We should enable this for V1, but VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT, - so use VLLM_USE_V1=0 for all tests in the file. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.fixture(scope="module", autouse=True) -def check_settings(): - assert ENABLE_ARTIFICIAL_PREEMPT is True, ( - "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1." - "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 " - "pytest tests/basic_correctness/test_preemption.py`") - - -@pytest.fixture -def distributed_executor_backend() -> str: - # When SPMD worker is used, use distributed_executor_backend="ray" - # to test delta input optimization works with preemption. 
- return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [96]) -@pytest.mark.parametrize("chunked_prefill_token_size", [16]) -def test_chunked_prefill_recompute( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - chunked_prefill_token_size: int, - distributed_executor_backend: str, -) -> None: - """Ensure that chunked prefill works with preemption.""" - max_num_seqs = min(chunked_prefill_token_size, 256) - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_batched_tokens = chunked_prefill_token_size - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=enable_chunked_prefill, - max_num_seqs=max_num_seqs, - distributed_executor_backend=distributed_executor_backend, - disable_log_stats=False, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - for i in range(len(example_prompts)): - hf_output_ids, hf_output_str = hf_outputs[i] - vllm_output_ids, vllm_output_str = vllm_outputs[i] - assert hf_output_str == vllm_output_str, ( - f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") - assert hf_output_ids == vllm_output_ids, ( - f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption( - caplog_vllm, - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """By default, recompute preemption is enabled""" - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner( - model, - dtype=dtype, - disable_log_stats=False, - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - total_preemption = ( - vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - assert ("is preempted by PreemptionMode.RECOMPUTE mode because there " - "is not enough KV cache space." 
in caplog_vllm.text) - # Ensure the count bucket of request-level histogram metrics matches - # the number of requests as a simple sanity check to ensure metrics are - # generated - preemption_metrics = None - for m in REGISTRY.collect(): - if m.name == "vllm:num_preemptions": - preemption_metrics = m - assert preemption_metrics is not None - total_recorded_preemption = 0 - for sample in preemption_metrics.samples: - total_recorded_preemption += sample.value - assert total_preemption == total_recorded_preemption - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) -def test_preemption_infeasible( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - distributed_executor_backend: str, -) -> None: - """Verify infeasible preemption request will be ignored.""" - BLOCK_SIZE = 16 - prefill_blocks = 2 - decode_blocks = max_tokens // BLOCK_SIZE - with vllm_runner( - model, - dtype=dtype, - block_size=BLOCK_SIZE, - # Not enough gpu blocks to complete a single sequence. - # preemption should happen, and the sequence should be - # ignored instead of hanging forever. - num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, - max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - distributed_executor_backend=distributed_executor_backend, - ) as vllm_model: - sampling_params = SamplingParams(max_tokens=max_tokens, - ignore_eos=True) - req_outputs = vllm_model.llm.generate( - example_prompts, - sampling_params=sampling_params, - ) - - assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt - < ARTIFICIAL_PREEMPTION_MAX_CNT) - - # Verify the request is ignored and not hang. - for req_output in req_outputs: - outputs = req_output.outputs - assert len(outputs) == 1 - assert outputs[0].finish_reason == "length" diff --git a/tests/build_cython.py b/tests/build_cython.py deleted file mode 100644 index 444434e8f0a7..000000000000 --- a/tests/build_cython.py +++ /dev/null @@ -1,39 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import Cython.Compiler.Options -from Cython.Build import cythonize -from setuptools import setup - -Cython.Compiler.Options.annotate = True - -infiles = [] - -infiles += [ - "vllm/engine/llm_engine.py", - "vllm/transformers_utils/detokenizer.py", - "vllm/engine/output_processor/single_step.py", - "vllm/outputs.py", - "vllm/engine/output_processor/stop_checker.py", -] - -infiles += [ - "vllm/core/scheduler.py", - "vllm/sequence.py", - "vllm/core/block_manager.py", -] - -infiles += [ - "vllm/model_executor/layers/sampler.py", - "vllm/sampling_params.py", - "vllm/utils/__init__.py", -] - -setup(ext_modules=cythonize(infiles, - annotate=False, - force=True, - compiler_directives={ - 'language_level': "3", - 'infer_types': True - })) - -# example usage: python3 build_cython.py build_ext --inplace diff --git a/tests/core/__init__.py b/tests/core/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/core/block/__init__.py b/tests/core/block/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/core/block/conftest.py b/tests/core/block/conftest.py deleted file mode 100644 index 6afe98d78ce8..000000000000 --- a/tests/core/block/conftest.py +++ /dev/null @@ -1,15 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - - 
-@pytest.fixture() -def should_do_global_cleanup_after_test() -> bool: - """Disable the global cleanup fixture for tests in this directory. This - provides a ~10x speedup for unit tests that don't load a model to GPU. - - This requires that tests in this directory clean up after themselves if they - use the GPU. - """ - return False diff --git a/tests/core/block/e2e/__init__.py b/tests/core/block/e2e/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py deleted file mode 100644 index e2c6c66b259c..000000000000 --- a/tests/core/block/e2e/conftest.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections.abc import Iterable -from typing import Callable, Optional - -import pytest - -from vllm import LLM -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.model_executor.utils import set_random_seed - - -@pytest.fixture -def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, seed): - return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, seed) - - -@pytest.fixture -def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - test_llm_kwargs, seed): - return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - test_llm_kwargs, seed) - - -def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - distinct_llm_kwargs, seed): - kwargs = { - **common_llm_kwargs, - **per_test_common_llm_kwargs, - **distinct_llm_kwargs, - } - - def generator_inner(): - llm = LLM(**kwargs) - - set_random_seed(seed) - - yield llm - del llm - cleanup_dist_env_and_memory() - - for llm in generator_inner(): - yield llm - del llm - - -def get_text_from_llm_generator(llm_generator: Iterable[LLM], - prompts, - sampling_params, - llm_cb: Optional[Callable[[LLM], - None]] = None): - for llm in llm_generator: - if llm_cb: - llm_cb(llm) - outputs = llm.generate(prompts, sampling_params, use_tqdm=True) - text = [output.outputs[0].text for output in outputs] - del llm - - return text - - -def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params): - for llm in llm_generator: - outputs = llm.generate(prompts, sampling_params, use_tqdm=True) - token_ids = [output.outputs[0].token_ids for output in outputs] - del llm - - return token_ids diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py deleted file mode 100644 index 93222b564ebe..000000000000 --- a/tests/core/block/e2e/test_correctness.py +++ /dev/null @@ -1,479 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from itertools import cycle - -import pytest - -from vllm import SamplingParams - -from .conftest import get_token_ids_from_llm_generator - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - - # Allow only 5 sequences of ~1024 tokens in worst case. 
- "block_size": 16, - "num_gpu_blocks_override": 5 * (64 + 1), - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "preemption_mode": "swap" -}, { - "preemption_mode": "recompute" -}]) -@pytest.mark.parametrize("batch_size", [10]) -@pytest.mark.parametrize("seed", [1]) -def test_block_manager_with_preemption(baseline_llm_generator, - test_llm_generator, batch_size): - """Verify block manager produces same outputs even when there is preemption. - - This constructs two LLM, each with limited number of GPU blocks. The limit - is decided such that as the sequences in the batch grow, sequences must be - preempted and removed from cache. - - If the output token ids are equivalent, then we have confidence that the KV - cache is not corrupted. - - NOTE: We want a significant number of generated tokens so that any incorrect - KV mapping has time to build up error. - - NOTE(Kuntai): Though we have removed block manager v1, this test is still - useful as it asserts the behavior of block manager v2 (now it is called - SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we - keep this test. - """ - output_len = 1024 - temperature = 0.0 - - # We want to ensure equality even with preemption. - # We force the total block size to be 1 + cdiv(output_len, block_size) - # so that only one sequence can fit at a time (once the sequences grow). - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) - - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) - - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): - assert expected_token_ids == actual_token_ids - - assert baseline_token_ids == test_token_ids - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # Our prompts will generate 128 tokens; since the prompts themselves are - # small, we don't need much KV space beyond 128. - "max_model_len": 160, - - # skip cuda graph creation for fast test. - "enforce_eager": True, - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - { - "block_size": 16, - - # Allow only 2 sequences of ~128 tokens in worst case. - # Note 8 = 128/block_size - "num_gpu_blocks_override": 2 * (8 + 1), - }, - { - "block_size": 8, - - # Allow only 2 sequences of ~128 tokens in worst case. 
- # Note 16 = 128/block_size - "num_gpu_blocks_override": 2 * (16 + 2), - } - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "num_lookahead_slots": 0, -}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - # We run one test with block_size < lookahead_slots, one test with - # block_size > lookahead_slots - "num_lookahead_slots": 10, - "preemption_mode": "swap", - }, - { - "num_lookahead_slots": 10, - "preemption_mode": "recompute", - } - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, - test_llm_generator, - batch_size): - """Verify vLLM produces the same output with greedy sampling, when lookahead - scheduling is used vs. not. - - Lookahead scheduling is not expected to modify the output, as it simply - allocates empty slots ahead of the known token ids in a sliding fashion. - - This test constrains the total number of blocks to force preemption. It also - varies the block size so that the lookahead size is less than and greater - than the block size. - """ - output_len = 128 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - print('Getting token ids without lookahead scheduling') - baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) - - print('Getting token ids with lookahead scheduling') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) - - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): - assert expected_token_ids == actual_token_ids - - assert baseline_token_ids == test_token_ids - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [ - { - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - "enable_chunked_prefill": True, - }, - ]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", - [{ - "block_size": 16, - "max_num_batched_tokens": 2, - "max_num_seqs": 2, - }, { - "block_size": 16, - "max_num_batched_tokens": 3, - "max_num_seqs": 2, - }, { - "block_size": 16, - "max_num_batched_tokens": 256, - "max_num_seqs": 10, - }]) -@pytest.mark.parametrize("baseline_llm_kwargs", [ - {}, -]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "num_lookahead_slots": 0, - }, - { - "num_lookahead_slots": 5, - }, -]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -def test_chunked_prefill_block_manager(baseline_llm_generator, - test_llm_generator, batch_size): - """Verify that chunked prefill works with SelfAttnBlockSpaceManager, - with and without lookahead scheduling. - """ - output_len = 32 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - "The president of the United States is", - ("1 + " * 50) + " 1 = ", # Longer prompt. 
- "The capital of France is", - "The future of AI is", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - print('Getting token ids with BlockManager') - baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) - - print('Getting token ids with BlockManager, with lookahead slots.') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) - - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): - assert expected_token_ids == actual_token_ids - - assert baseline_token_ids == test_token_ids - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - - # Allow only 5 sequences of ~1024 tokens in worst case. - "block_size": 16, - "num_gpu_blocks_override": 5 * (64 + 1), - - # Enable prefill cache - "enable_prefix_caching": True, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "preemption_mode": "swap" -}, { - "preemption_mode": "recompute" -}]) -@pytest.mark.parametrize("batch_size", [10]) -@pytest.mark.parametrize("seed", [1]) -def test_block_manager_prefix_caching_enabled_with_preemption( - baseline_llm_generator, test_llm_generator, batch_size): - """Verify block manager produces same outputs even when there is preemption. - - This constructs two LLM, each with limited number of GPU blocks. The limit - is decided such that as the sequences in the batch grow, sequences must be - preempted and removed from cache. - - If the output token ids are equivalent, then we have confidence that the KV - cache is not corrupted. - - NOTE: We want a significant number of generated tokens so that any incorrect - KV mapping has time to build up error. - - NOTE(Kuntai): Though we have removed block manager v1, this test is still - useful as it asserts the behavior of block manager v2 (now it is called - SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we - keep this test. - """ - output_len = 1024 - temperature = 0.0 - - # We want to ensure equality even with preemption. - # We force the total block size to be 1 + cdiv(output_len, block_size) - # so that only one sequence can fit at a time (once the sequences grow). - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - print('Getting token ids from block manager') - baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) - - print('Getting token ids from block manager, with preemption') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) - - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): - assert expected_token_ids == actual_token_ids - - assert baseline_token_ids == test_token_ids - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. 
- "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - - # Allow only 5 sequences of ~1024 tokens in worst case. - "block_size": 16, - "num_gpu_blocks_override": 5 * (64 + 1), - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "enable_prefix_caching": False -}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "enable_prefix_caching": True, - "preemption_mode": "swap" -}, { - "enable_prefix_caching": True, - "preemption_mode": "recompute" -}]) -@pytest.mark.parametrize("batch_size", [10]) -@pytest.mark.parametrize("seed", [1]) -def test_auto_prefix_caching_with_preemption(baseline_llm_generator, - test_llm_generator, batch_size): - """Verify block manager v2 with auto prefix caching enabled produces same - outputs as auto prefix caching disabled, even when there is preemption. - - This constructs two LLM, each with limited number of GPU blocks. The limit - is decided such that as the sequences in the batch grow, sequences must be - preempted and removed from cache. - - If the output token ids are equivalent, then we have confidence that auto - prefix caching itself at least don't cause result error. - """ - output_len = 1024 - temperature = 0.0 - - # We want to ensure equality even with preemption. - # We force the total block size to be 1 + cdiv(output_len, block_size) - # so that only one sequence can fit at a time (once the sequences grow). - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - print('Getting token ids with APC disabled') - baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) - - print('Getting token ids with APC enabled') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) - - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): - assert expected_token_ids == actual_token_ids - - assert baseline_token_ids == test_token_ids - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. - "model": "facebook/opt-125m", - - # skip cuda graph creation for fast test. - "enforce_eager": True, - - # we keep the blocks small, so that hit eviction quickly - "max_model_len": 48, - "block_size": 16, - "num_gpu_blocks_override": 3, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "enable_prefix_caching": False -}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "enable_prefix_caching": True, -}]) -@pytest.mark.parametrize("seed", [1]) -def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator, - test_llm_generator): - """Verify block manager v2 with auto prefix caching could works normal - even when eviction started. - With APC enabled, all blocks are held by native block at the beginning. - Then blocks are managed by evictor instead. If cache hit at the evitor's - block, then it could be reused, or we need to recompute its kv cache. - """ - output_len = 10 - temperature = 0.0 - - prompts = [ - "You are a helpful assistant. Please answer truthfully and write " - "out your thinking step by step to be sure you get the right answer. 
" - "If you make a mistake, attempt to correct it. who are you?", - "You are a helpful assistant. Please answer truthfully and write out " - "your thinking step by step to be sure you get the right answer. You " - "are helpful and harmless and you follow ethical guidelines. " - "who are you?" - ] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - print('Getting token ids with APC disabled') - baseline_token_ids = get_token_ids_from_llm_generator( - baseline_llm_generator, prompts, sampling_params) - - print('Getting token ids with APC enabled') - test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, - prompts, sampling_params) - - for expected_token_ids, actual_token_ids in zip(baseline_token_ids, - test_token_ids): - assert expected_token_ids == actual_token_ids - - assert baseline_token_ids == test_token_ids diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py deleted file mode 100644 index 4d67eea2264b..000000000000 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random - -import pytest - -from tests.kernels.utils import override_backend_env_variable -from vllm import LLM, SamplingParams -from vllm.platforms import current_platform - -from .conftest import get_text_from_llm_generator - -# relatively small model with 4k sliding window -MODEL = "bigcode/starcoder2-3b" -BLOCK_SIZE = 16 - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model": MODEL, - - # skip cuda graph creation for fast test. - "enforce_eager": True, - "block_size": BLOCK_SIZE, - # needed due to https://github.com/vllm-project/vllm/issues/1908#issuecomment-2101122008 - "num_gpu_blocks_override": 100000 // BLOCK_SIZE, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [5]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) -def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator, - batch_size, seed, backend, monkeypatch): - """ - The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then - asks for value of one of them (which is outside the sliding window). - If we tell it upfront which we are going to be looking for, then - it answers correctly (mostly). - - Additionally, we compare the results of the v1 and v2 managers. 
- """ - if backend == "FLASHINFER" and current_platform.is_rocm(): - pytest.skip("Flashinfer does not support ROCm/HIP.") - if backend == "XFORMERS" and current_platform.is_rocm(): - pytest.skip("Xformers does not support ROCm/HIP.") - - override_backend_env_variable(monkeypatch, backend) - - sampling_params = SamplingParams( - max_tokens=1024, - ignore_eos=True, - temperature=0.0, - ) - - prompts, answer, indices = prep_prompts(batch_size) - - baseline_texts = get_text_from_llm_generator(baseline_llm_generator, - prompts, - sampling_params, - llm_cb=check_window(prompts)) - - check_answers(indices, answer, baseline_texts) - - print('Getting token ids from block manager v2') - test_texts = get_text_from_llm_generator(test_llm_generator, prompts, - sampling_params) - check_answers(indices, answer, test_texts) - - cmp = [ - expected_text == actual_text - for expected_text, actual_text in zip(baseline_texts, test_texts) - ] - print(cmp) - # make sure it's mostly OK; this is possibly because https://github.com/vllm-project/vllm/pull/4768 - # however, https://github.com/vllm-project/vllm/issues/3385#issuecomment-1995924290 - # states that xformers and flash_attn have different ideas about the window - # size anyways - assert sum(cmp) > 0.7 * len(cmp) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model": MODEL, - - # skip cuda graph creation for fast test. - "enforce_eager": True, - "block_size": BLOCK_SIZE, - "num_gpu_blocks_override": 100000 // BLOCK_SIZE, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}]) -@pytest.mark.parametrize("batch_size", [5]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) -def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed, - backend, monkeypatch): - """ - This is similar to test_sliding_window_retrieval, however, it doesn't - compare against the v1 block manager since v1 doesn't support - chunked prefill with sliding window. - - The results with and without chunked prefill are not the same due to - numerical instabilities. - """ - if backend == "FLASHINFER" and current_platform.is_rocm(): - pytest.skip("Flashinfer does not support ROCm/HIP.") - if backend == "XFORMERS" and current_platform.is_rocm(): - pytest.skip("Xformers does not support ROCm/HIP.") - override_backend_env_variable(monkeypatch, backend) - - sampling_params = SamplingParams( - max_tokens=10, - ignore_eos=True, - temperature=0.0, - ) - - prompts, answer, indices = prep_prompts(batch_size) - - # We don't compare with the baseline model here, since the results - # slightly different due to different tailing in attention. - test_texts = get_text_from_llm_generator(test_llm_generator, - prompts, - sampling_params, - llm_cb=check_window(prompts)) - check_answers(indices, answer, test_texts) - - -def prep_prompts(batch_size: int, ln_range: tuple[int, int] = (800, 1100)): - """ - Generate prompts which a bunch of assignments, - then asking for the value of one of them. - The prompt is just under 10k tokens; sliding window is 4k - so the answer is outside sliding window, but should still be correct. 
- - Args: - batch_size: number of prompts to generate - ln_range: an argument to control the length of the prompt - """ - prompts: list[str] = [] - answer: list[int] = [] - indices: list[int] = [] - random.seed(1) - for _ in range(batch_size): - idx = random.randint(30, 90) - indices.append(idx) - prompt = "```python\n# We set a number of variables, " + \ - f"x{idx} will be important later\n" - ln = random.randint(*ln_range) - for k in range(30, ln): - v = random.randint(10, 99) - if k == idx: - answer.append(v) - prompt += f"x{k} = {v}\n" - prompt += f"# Now, we check the value of x{idx}:\n" - prompt += f"assert x{idx} == " - prompts.append(prompt) - return prompts, answer, indices - - -def check_answers(indices: list[int], - answer: list[int], - outputs: list[str], - accept_rate: float = 0.7): - answer2 = [int(text[0:2].strip()) for text in outputs] - print(list(zip(indices, zip(answer, answer2)))) - numok = 0 - for a1, a2 in zip(answer, answer2): - if a1 == a2: - numok += 1 - frac_ok = numok / len(answer) - print(f"Num OK: {numok}/{len(answer)} {frac_ok}") - assert frac_ok >= accept_rate - - -def check_window(prompts: list[str]): - - def inner(llm: LLM): - sliding_window = llm.llm_engine.model_config.get_sliding_window() - assert sliding_window and sliding_window > 0 - assert any( - len(llm.get_tokenizer().tokenize(prompt)) > sliding_window - for prompt in prompts) - - return inner diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py deleted file mode 100644 index 9eed264fd7d4..000000000000 --- a/tests/core/block/test_block_manager.py +++ /dev/null @@ -1,494 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, - STR_NOT_IMPL_ENC_DEC_SWA) -from vllm.core.block_manager import SelfAttnBlockSpaceManager -from vllm.core.interfaces import AllocStatus -from vllm.sequence import Logprob, SequenceStatus -from vllm.utils import chunk_list - -from ..utils import (create_dummy_prompt, create_seq_group, - create_seq_group_encoder_decoder) - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80]) -@pytest.mark.parametrize("num_seqs_per_group", [1, 4]) -@pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int, - num_gpu_blocks: int, watermark: float): - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - watermark=watermark, - ) - num_watermark_blocks = int(watermark * num_gpu_blocks) - - num_output_blocks_per_seq = 1 - - # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but - # the current implementation assumes all seqs are new prompts / don't have - # different output lens. 
- num_output_blocks = num_output_blocks_per_seq - - for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks): - seq_group = create_seq_group( - seq_prompt_len=block_size * num_prompt_blocks, - seq_output_lens=[ - block_size * num_output_blocks_per_seq - for _ in range(num_seqs_per_group) - ], - ) - - assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks - - can_allocate_result = block_manager.can_allocate(seq_group) - - num_required_blocks = num_prompt_blocks + num_output_blocks - - if num_gpu_blocks - num_required_blocks < num_watermark_blocks: - assert can_allocate_result == AllocStatus.NEVER - elif num_gpu_blocks >= num_required_blocks: - assert can_allocate_result == AllocStatus.OK - else: - assert can_allocate_result == AllocStatus.LATER - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160]) -@pytest.mark.parametrize("num_seqs_per_group", [1, 4]) -@pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_seq_group_encoder_decoder(block_size: int, - num_seqs_per_group: int, - num_gpu_blocks: int, - watermark: float): - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - watermark=watermark, - ) - num_watermark_blocks = int(watermark * num_gpu_blocks) - - num_output_blocks_per_seq = 1 - - # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but - # the current implementation assumes all seqs are new prompts / don't have - # different output lens. - num_output_blocks = num_output_blocks_per_seq - - for bdx, num_prompt_blocks in enumerate( - range(1, num_gpu_blocks - num_output_blocks)): - num_cross_blocks_per_seq = num_prompt_blocks - - seq_group = create_seq_group_encoder_decoder( - seq_prompt_len=block_size * num_prompt_blocks, - seq_output_lens=[ - block_size * num_output_blocks_per_seq - for _ in range(num_seqs_per_group) - ], - request_id=str(bdx)) - - assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks - - can_allocate_result = block_manager.can_allocate(seq_group) - - num_required_blocks = num_prompt_blocks + \ - num_output_blocks + \ - num_cross_blocks_per_seq - - if num_gpu_blocks - num_required_blocks < num_watermark_blocks: - assert can_allocate_result == AllocStatus.NEVER - elif num_gpu_blocks >= num_required_blocks: - assert can_allocate_result == AllocStatus.OK - else: - assert can_allocate_result == AllocStatus.LATER - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("num_gpu_blocks", [16]) -@pytest.mark.parametrize("num_seqs_per_group", [1]) -@pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, - num_seqs_per_group: int, - num_gpu_blocks: int, - watermark: float): - ''' - SWA short for Sliding Window Attention. - - At time of writing block manager does not support SWA. - - However even when SWA is implemented for block manager, - there will still most likely be a separate workstream required - to enable SWA for encoder/decoder models. - - Therefore this test enforces that one of the following cases - hold true: - 1. Block manager does not support SWA at all (true at time of writing) - 2. Block manager fails with NotImplementError when SWA is enabled - AND a SequenceGroup with an encoder sequence (i.e. 
in support of an - encoder/decoder model) is passed into can_allocate() as an argument - - The setup for this test is stripped down version of - test_can_allocate_seq_group_encoder_decoder() - ''' - - with pytest.raises((NotImplementedError, AssertionError)) as exc_info: - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - watermark=watermark, - sliding_window=5 # SWA - ) - - num_output_blocks_per_seq = 1 - num_prompt_blocks = 1 - num_output_blocks = num_output_blocks_per_seq - seq_group = create_seq_group_encoder_decoder( - seq_prompt_len=block_size * num_prompt_blocks, - seq_output_lens=[ - block_size * num_output_blocks_per_seq - for _ in range(num_seqs_per_group) - ], - request_id="0") - - assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks - block_manager.can_allocate(seq_group) - - # Assert that either - # 1. Block manager constructor fails with assertion that sliding window - # is not yet supported (most likely near-term outcome at time of - # writing), or - # 2. can_allocate() fails with NotImplementedError due to combination of - # encoder/decoder and sliding window attention - if isinstance(exc_info.value, NotImplementedError): - assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA - elif isinstance(exc_info.value, AssertionError): - assert str(exc_info.value) == "Sliding window not yet supported" - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("num_gpu_blocks", [16]) -@pytest.mark.parametrize("num_seqs_per_group", [1]) -@pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_encoder_decoder_fails_with_prefix_cache( - block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, - watermark: float): - - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - watermark=watermark, - enable_caching=True # Prefix cache - ) - - num_output_blocks_per_seq = 1 - num_prompt_blocks = 1 - num_output_blocks = num_output_blocks_per_seq - seq_group = create_seq_group_encoder_decoder( - seq_prompt_len=block_size * num_prompt_blocks, - seq_output_lens=[ - block_size * num_output_blocks_per_seq - for _ in range(num_seqs_per_group) - ], - request_id="0") - - assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks - - # Assert that either can_allocate() fails with NotImplementedError - # due to combination of encoder/decoder and prefix cache - with pytest.raises(NotImplementedError) as exc_info: - block_manager.can_allocate(seq_group) - assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE - - -@pytest.mark.parametrize("block_size", [1, 8]) -@pytest.mark.parametrize("prompt_len", [1, 7, 8]) -@pytest.mark.parametrize("num_slots_to_append", [1, 8, 129]) -@pytest.mark.parametrize("num_lookahead_slots", [0, 10]) -def test_append_slots(block_size, prompt_len, num_slots_to_append, - num_lookahead_slots): - """Verify append_slots consumes the correct number of blocks from the block - table. 
- """ - - num_gpu_blocks = 1024 - watermark = 0.1 - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=0, - watermark=watermark, - ) - - seq_group = create_seq_group( - seq_prompt_len=prompt_len, - seq_output_lens=[0], - ) - - # Allocate seq - assert block_manager.can_allocate(seq_group) - block_manager.allocate(seq_group) - - # Seq seq to RUNNING - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - # Append tokens to the sequeqnce - for token_id in range(num_slots_to_append): - seq.append_token_id(token_id, {token_id: Logprob(0.0)}) - - # Append slots for new tokens and lookahead slots. - free_blocks_before_append = block_manager.get_num_free_gpu_blocks() - block_manager.append_slots(seq, num_lookahead_slots) - num_consumed_blocks = (free_blocks_before_append - - block_manager.get_num_free_gpu_blocks()) - - # Expect consumed blocks to be new blocks required to support the new slots. - expected_consumed_blocks = len( - list( - chunk_list( - list( - range(prompt_len + num_slots_to_append + - num_lookahead_slots)), - block_size))) - len( - list(chunk_list(list(range(prompt_len)), block_size))) - assert num_consumed_blocks == expected_consumed_blocks - - -@pytest.mark.parametrize("block_size", [8]) -@pytest.mark.parametrize("num_cpu_blocks", [4]) -@pytest.mark.parametrize("num_gpu_blocks", [4]) -@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10]) -@pytest.mark.parametrize("enable_caching", [False, True]) -def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, - enable_caching): - """Verify blocks number on src/desc device is correct after swapping in/out - sequence group (not missing or extra blocks). - """ - block_manager = SelfAttnBlockSpaceManager(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0, - enable_caching=enable_caching) - prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1) - prompt.status = SequenceStatus.WAITING - block_manager.allocate(seq_group) - - # Emulate a forward pass by appending a single token. - # The block manager then knows how many unprocessed - # tokens will be written in the next forward pass. - token_id = 0 - prompt.status = SequenceStatus.RUNNING - prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) - - # Swap seq group from GPU -> CPU. - gpu_blocks = block_manager.get_block_table(prompt) - assert block_manager.can_swap_out(seq_group) - before_cpu_blocks = block_manager.get_num_free_cpu_blocks() - before_gpu_blocks = block_manager.get_num_free_gpu_blocks() - mapping = block_manager.swap_out(seq_group) - mapping_keys = [key for key, _ in mapping] - assert mapping_keys == gpu_blocks - after_cpu_blocks = block_manager.get_num_free_cpu_blocks() - after_gpu_blocks = block_manager.get_num_free_gpu_blocks() - assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) - assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks - prompt.status = SequenceStatus.SWAPPED - - # Swap seq group from CPU -> GPU. 
- assert block_manager.can_swap_in(seq_group, num_lookahead_slots) - before_cpu_blocks = block_manager.get_num_free_cpu_blocks() - before_gpu_blocks = block_manager.get_num_free_gpu_blocks() - mapping = block_manager.swap_in(seq_group) - cpu_blocks = block_manager.get_block_table(prompt) - mapping_keys = [key for key, _ in mapping] - assert mapping_keys == [cpu_blocks[0]] - after_cpu_blocks = block_manager.get_num_free_cpu_blocks() - after_gpu_blocks = block_manager.get_num_free_gpu_blocks() - assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) - - -@pytest.mark.parametrize("block_size", [8]) -@pytest.mark.parametrize("num_gpu_blocks", [4]) -@pytest.mark.parametrize("num_lookahead_slots", [3, 8, 10]) -@pytest.mark.parametrize("enable_caching", [True, False]) -def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots, - enable_caching): - """ Verify the block manager can correctly determine if a sequence group - can be swapped in/out. - """ - num_cpu_blocks = num_gpu_blocks - block_manager = SelfAttnBlockSpaceManager(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0, - enable_caching=enable_caching) - prompt, seq_group = create_dummy_prompt( - "1", prompt_length=(num_gpu_blocks - 1) * block_size - 1) - prompt.status = SequenceStatus.WAITING - block_manager.allocate(seq_group) - prompt.status = SequenceStatus.RUNNING - - # Swap seq group from GPU -> CPU. - gpu_blocks = block_manager.get_block_table(prompt) - assert block_manager.can_swap_out(seq_group) - before_cpu_blocks = block_manager.get_num_free_cpu_blocks() - before_gpu_blocks = block_manager.get_num_free_gpu_blocks() - mapping = block_manager.swap_out(seq_group) - mapping_keys = [key for key, _ in mapping] - assert mapping_keys == gpu_blocks - after_cpu_blocks = block_manager.get_num_free_cpu_blocks() - after_gpu_blocks = block_manager.get_num_free_gpu_blocks() - assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) - assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks - prompt.status = SequenceStatus.SWAPPED - - # At this moment, we still have enough free blocks to swap in the seq group. - if num_lookahead_slots <= block_size: - assert block_manager.can_swap_in(seq_group, - num_lookahead_slots) == AllocStatus.OK - else: - assert block_manager.can_swap_in( - seq_group, num_lookahead_slots) == AllocStatus.NEVER - - # During Swapped out, 2 cached blocks were evicted from the GPU, - # so the prompt1 can't be swapped in - prompt2_len = 2 * block_size - 1 - prompt2, seq_group2 = create_dummy_prompt( - "2", - prompt_length=prompt2_len, - prompt_tokens=[10000 + i for i in range(prompt2_len)]) - prompt2.status = SequenceStatus.WAITING - block_manager.allocate(seq_group2) - - # Swap seq group from CPU -> GPU. - if num_lookahead_slots <= block_size: - assert block_manager.can_swap_in( - seq_group, num_lookahead_slots) == AllocStatus.LATER - else: - assert block_manager.can_swap_in( - seq_group, num_lookahead_slots) == AllocStatus.NEVER - - -@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10]) -@pytest.mark.parametrize("enable_caching", [False, True]) -def test_swap_in_infeasible(num_lookahead_slots, enable_caching): - """Verifies that swapping fails if there is not enough free blocks - to account for unseen tokens and lookahead_slots. 
- """ - block_size = 8 - num_cpu_blocks = 1 - num_gpu_blocks = 1 - block_manager = SelfAttnBlockSpaceManager(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0, - enable_caching=enable_caching) - prompt_length = block_size - 3 - assert prompt_length > 0 - prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length) - prompt.status = SequenceStatus.WAITING - block_manager.allocate(seq_group) - # Emulate a forward pass by appending a single token. - # The block manager then knows how many unprocessed - # tokens will be written in the next forward pass. - token_id = 0 - prompt.status = SequenceStatus.RUNNING - prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) - - # Swap seq group from GPU -> CPU. - assert block_manager.can_swap_out(seq_group) - block_manager.swap_out(seq_group) - prompt.status = SequenceStatus.SWAPPED - - # Swap seq group from CPU -> GPU. - # The number of unseen tokens is 1. If the number of existing - # tokens plus the unseen ones and number of lookahead slots exceeds - # the total number of available GPU blocks then the swap - # should fail. - num_unseen_tokens = 1 - if (num_lookahead_slots + num_unseen_tokens + - prompt_length) <= (block_size * num_gpu_blocks): - assert block_manager.can_swap_in(seq_group, - num_lookahead_slots) == AllocStatus.OK - else: - assert block_manager.can_swap_in( - seq_group, num_lookahead_slots) == AllocStatus.NEVER - - -# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level. - - -@pytest.mark.parametrize("block_size", [8, 16]) -@pytest.mark.parametrize("prompt_len", [10, 300, 1000]) -@pytest.mark.parametrize("num_slots_to_append", [50]) -@pytest.mark.parametrize("sliding_window", [20, 32, 200, 512]) -def test_sliding_window(block_size, prompt_len, num_slots_to_append, - sliding_window): - """Verify append_slots consumes the correct number of blocks from the block - table. 
- """ - - num_gpu_blocks = 1024 - watermark = 0.1 - block_manager = SelfAttnBlockSpaceManager( - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=0, - watermark=watermark, - sliding_window=sliding_window, - ) - - def check_used(min_n, max_n=None): - if max_n is None: - max_n = min_n - used = num_gpu_blocks - block_manager.get_num_free_gpu_blocks() - assert min_n <= used - assert used <= max_n - - def num_blocks(num_tokens): - return (num_tokens + block_size - 1) // block_size - - check_used(0) - - seq_group = create_seq_group( - seq_prompt_len=prompt_len, - seq_output_lens=[0], - ) - - check_used(0) - - # Allocate seq - assert block_manager.can_allocate(seq_group) - block_manager.allocate(seq_group) - - check_used(num_blocks(prompt_len)) - - # Seq seq to RUNNING - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - seq.data.update_num_computed_tokens(prompt_len) - check_used(num_blocks(prompt_len)) - - # this is how we compute it in SelfAttnBlockSpaceManager.__init__ - sliding_blocks = (sliding_window // block_size) + 2 - # plus one block for null block - sliding_blocks += 1 - - # Append tokens to the sequeqnce - for token_id in range(num_slots_to_append): - seq.append_token_id(token_id, {token_id: Logprob(0.0)}) - seq.data.update_num_computed_tokens(1) - block_manager.append_slots(seq, num_lookahead_slots=0) - if prompt_len < sliding_window + 10: - check_used(0, sliding_blocks + 1) - else: - check_used(sliding_blocks, sliding_blocks + 1) diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py deleted file mode 100644 index ba085001136b..000000000000 --- a/tests/core/block/test_block_table.py +++ /dev/null @@ -1,577 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.core.block.block_table import BlockTable -from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator -from vllm.utils import Device, cdiv, chunk_list - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -def test_allocate_naive(block_size: int, sequence_len: int): - """Test the allocation of blocks using the naive allocator. - - This test creates a CpuGpuBlockAllocator with the specified block size and - number of blocks. It then allocates multiple BlockTables with varying - sequence lengths and verifies that the number of free blocks decreases as - expected after each allocation. - """ - assert block_size > 1 - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type="naive", - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size))) - - block_tables: list[BlockTable] = [] - for i in range(5): - assert allocator.get_num_free_blocks( - device=Device.GPU) == num_gpu_blocks - i * num_blocks_per_alloc - - block_tables.append( - BlockTable( - block_size=block_size, - block_allocator=allocator, - )) - block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU) - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -def test_allocate_prefix_caching(block_size: int, sequence_len: int): - """Test the allocation of blocks using the prefix caching allocator. 
- - This test creates a CpuGpuBlockAllocator with the specified block size and - number of blocks, using the prefix caching allocator. It then allocates - multiple BlockTables with varying sequence lengths and verifies that the - number of free blocks decreases as expected after each allocation. - - The test expects all sequences to share allocations, except for their last - block, which may be mutable. It calculates the expected number of immutable - and mutable blocks per allocation based on the sequence length and block - size. - """ - assert block_size > 1 - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type="prefix_caching", - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - chunked_tokens = list(chunk_list(token_ids, block_size)) - num_mutable_blocks_per_alloc = 0 if len( - chunked_tokens[-1]) == block_size else 1 - num_immutable_blocks_per_alloc = len( - chunked_tokens) - num_mutable_blocks_per_alloc - - block_tables: list[BlockTable] = [] - for alloc_i in range(1, 6): - - block_tables.append( - BlockTable( - block_size=block_size, - block_allocator=allocator, - )) - block_tables[-1].allocate(token_ids=token_ids, device=Device.GPU) - - # Expect all sequences to share allocations, except for their last block - # (which may be mutable). - assert allocator.get_num_free_blocks( - device=Device.GPU) == num_gpu_blocks - ( - num_immutable_blocks_per_alloc + num_mutable_blocks_per_alloc * - (alloc_i)) - - -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -@pytest.mark.parametrize("device", ["cpu", "gpu"]) -def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str, - device: str): - """Test the allocation and freeing of blocks using different allocators and - devices. - - This test creates a CpuGpuBlockAllocator with the specified block size, - number of blocks, allocator type, and device. It then allocates a BlockTable - multiple times with the same sequence and verifies that the number of free - blocks remains consistent after each allocation and freeing. - """ - device = Device[device.upper()] - - num_device_blocks = 1024 - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_device_blocks, - num_cpu_blocks=num_device_blocks, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - num_blocks_per_alloc = len(list(chunk_list(token_ids, block_size))) - - block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - for i in range(5): - block_table.allocate(token_ids=token_ids, device=device) - assert allocator.get_num_free_blocks( - device) == num_device_blocks - num_blocks_per_alloc - assert all(block_id is not None - for block_id in block_table.physical_block_ids) - - block_table.free() - assert allocator.get_num_free_blocks(device) == num_device_blocks - - -@pytest.mark.parametrize("block_size", [1, 8]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -@pytest.mark.parametrize("append_len", [1, 16, 129]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_append_token_ids_allocation(block_size: int, sequence_len: int, - append_len: int, allocator_type: str): - """Test the allocation behavior when appending token IDs to a BlockTable. 
- - This test creates a CpuGpuBlockAllocator with the specified block size, - number of blocks, and allocator type. It then allocates a BlockTable with an - initial sequence and appends additional token IDs to it. The test verifies - that the number of allocated blocks before and after appending matches the - expected values. - """ - - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - token_ids_to_append = list(range(append_len)) - - block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - num_expected_blocks_before_append = len( - list(chunk_list(token_ids, block_size))) - num_expected_appended_blocks = len( - list(chunk_list(token_ids + token_ids_to_append, - block_size))) - num_expected_blocks_before_append - - block_table.allocate(token_ids=token_ids, device=Device.GPU) - - assert len( - block_table.physical_block_ids) == num_expected_blocks_before_append - block_table.append_token_ids(token_ids_to_append) - assert len( - block_table.physical_block_ids - ) == num_expected_blocks_before_append + num_expected_appended_blocks - - -@pytest.mark.parametrize("block_size", [1, 8]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -@pytest.mark.parametrize("num_empty_slots", [1, 16, 129]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int, - num_empty_slots: int, - allocator_type: str): - """Test the allocation behavior when ensuring a certain number of empty - slots in a BlockTable. - - This test creates a CpuGpuBlockAllocator with the specified block size, - number of blocks, and allocator type. It then allocates a BlockTable with an - initial sequence and ensures a certain number of empty slots. The test - verifies that the number of allocated blocks before and after ensuring empty - slots matches the expected values. It also checks that filling up the empty - slots does not consume additional blocks. - """ - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - - block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - num_expected_blocks_before_append = len( - list(chunk_list(token_ids, block_size))) - num_expected_appended_blocks = len( - list(chunk_list(token_ids + [-1] * num_empty_slots, - block_size))) - num_expected_blocks_before_append - - block_table.allocate(token_ids=token_ids, device=Device.GPU) - - # Assert that the empty slots consume the expected number of additional - # blocks. - assert len( - block_table.physical_block_ids) == num_expected_blocks_before_append - block_table.ensure_num_empty_slots(num_empty_slots) - assert len( - block_table.physical_block_ids - ) == num_expected_blocks_before_append + num_expected_appended_blocks - - # Now, ensure no additional blocks consumed as we fill up the empty slots. 
- num_free_blocks = allocator.get_num_free_blocks(device=Device.GPU) - block_table.append_token_ids(token_ids=list(range(num_empty_slots))) - assert num_free_blocks == allocator.get_num_free_blocks(device=Device.GPU) - - -@pytest.mark.parametrize("block_size", [1, 8]) -@pytest.mark.parametrize("sequence_len", [1, 9]) -@pytest.mark.parametrize("append_len", [1, 16, 129]) -@pytest.mark.parametrize("append_size", [1, 4, 129]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_append_token_ids_correct_content(block_size: int, sequence_len: int, - append_len: int, allocator_type: str, - append_size: int): - """Verify token ids are correctly appended. Appends various amounts of - token ids in various append sizes, and verifies the final sequence is - correct. - """ - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=1024, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - token_ids_to_append = list(range(append_len)) - - block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - block_table.allocate(token_ids=token_ids, device=Device.GPU) - - appended_so_far: list[int] = [] - for append in chunk_list(token_ids_to_append, append_size): - block_table.append_token_ids(append) - appended_so_far.extend(append) - - assert block_table._get_all_token_ids() == token_ids + appended_so_far - - assert block_table._get_all_token_ids() == token_ids + token_ids_to_append - - -@pytest.mark.parametrize("seq_len", [1, 9, 129]) -@pytest.mark.parametrize("block_size", [1, 8]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_fork(seq_len: int, block_size: int, allocator_type: str): - """Create a sequence using the specified allocator. - 1. Assert that after forking the sequence, the free block count is the - same. - 2. Assert that the forked sequence has the same physical mappings. - 3. Then free the original sequence; verify that the free block count is - the same. - 4. Finally, free the forked sequence and verify that the free block - count drops to zero. - """ - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=0, - block_size=block_size, - ) - - token_ids = list(range(seq_len)) - - block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - block_table.allocate(token_ids) - - num_free_blocks_before_fork = allocator.get_num_free_blocks( - device=Device.GPU) - - forked_block_table = block_table.fork() - - # Expect physical_block_ids and token_ids to match. - assert (block_table.physical_block_ids == - forked_block_table.physical_block_ids) - assert block_table._get_all_token_ids( - ) == forked_block_table._get_all_token_ids() - - # Do not expect any additional allocations. - assert allocator.get_num_free_blocks( - device=Device.GPU) == num_free_blocks_before_fork - - # Free the original blocks. Assert num free blocks does not change, since - # refcount is nonzero. - block_table.free() - assert allocator.get_num_free_blocks( - device=Device.GPU) == num_free_blocks_before_fork - - # Expect the forked block table to be unaffected by the free. - assert all(block_id is not None - for block_id in forked_block_table.physical_block_ids) - - # Free the forked blocks. Assert num free blocks does change, since - # refcount is now zero. 
- forked_block_table.free() - assert allocator.get_num_free_blocks(device=Device.GPU) == num_gpu_blocks - - -@pytest.mark.parametrize("block_size", [8]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -@pytest.mark.parametrize("append_len", [1, 16, 129]) -@pytest.mark.parametrize("appender", ["forked", "original"]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_cow(block_size: int, sequence_len: int, append_len: int, - allocator_type: str, appender: str): - """Fork a sequence; append to the forked sequence; verify there's a CoW. - """ - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=0, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - token_ids_to_append = list(range(append_len)) - - original_block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - num_expected_non_cow_blocks = cdiv(sequence_len, block_size) - num_expected_cow_blocks = cdiv(sequence_len + append_len, - block_size) - (sequence_len // block_size) - - original_block_table.allocate(token_ids=token_ids, device=Device.GPU) - original_block_ids = original_block_table.physical_block_ids[:] - - print("original_block_ids = {}".format(original_block_ids)) - forked_block_table = original_block_table.fork() - - # Expect no additional allocation (copy on _write_). - assert allocator.get_num_free_blocks( - Device.GPU) == (num_gpu_blocks - num_expected_non_cow_blocks) - - if appender == "forked": - appender_block_table = forked_block_table - static_block_table = original_block_table - elif appender == "original": - appender_block_table = original_block_table - static_block_table = forked_block_table - else: - raise ValueError(f"unknown test config {appender=}") - - # Write tokens. - appender_block_table.append_token_ids(token_ids_to_append) - - # Expect the non-appending block table to have no change. - assert static_block_table.physical_block_ids == original_block_ids - assert appender_block_table.physical_block_ids != original_block_ids - - # Expect the blocks changed during append to have a CoW. - assert allocator.get_num_free_blocks( - Device.GPU) == num_gpu_blocks - (num_expected_non_cow_blocks + - num_expected_cow_blocks) - - cows = allocator.clear_copy_on_writes() - if sequence_len % block_size > 0: - # If the last block in the sequence is not full, then when appending we - # expect a CoW. - assert cows - - cow_block_id = sequence_len // block_size - expected_src = static_block_table.physical_block_ids[cow_block_id] - expected_dst = appender_block_table.physical_block_ids[cow_block_id] - - assert (expected_src, expected_dst) in cows - else: - # Otherwise, there should be no copy-on-write. - assert not cows - - static_block_table.free() - appender_block_table.free() - - # After free, expect all blocks to be freed. - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - -@pytest.mark.parametrize("block_size", [8]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -@pytest.mark.parametrize("append_len", [1, 16, 129]) -@pytest.mark.parametrize("lookahead_slots", [1, 16, 129]) -@pytest.mark.parametrize("appender", ["forked", "original"]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_cow_lookahead_simple(block_size: int, sequence_len: int, - append_len: int, lookahead_slots: int, - allocator_type: str, appender: str): - """Similar to test_cow, except with lookahead allocation. 
The assertions are - less rigorous due to the complexity of the property under test. - """ - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=0, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - token_ids_to_append = list(range(append_len)) - - original_block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - original_block_table.allocate(token_ids=token_ids, device=Device.GPU) - - # Allocate lookahead slots. - original_block_table.ensure_num_empty_slots(lookahead_slots) - original_block_ids = original_block_table.physical_block_ids[:] - - forked_block_table = original_block_table.fork() - - if appender == "forked": - appender_block_table = forked_block_table - static_block_table = original_block_table - elif appender == "original": - appender_block_table = original_block_table - static_block_table = forked_block_table - else: - raise ValueError(f"unknown test config {appender=}") - - # Write tokens. - appender_block_table.append_token_ids(token_ids_to_append) - - # Expect the non-appending block table to have no change. - assert static_block_table.physical_block_ids == original_block_ids - assert appender_block_table.physical_block_ids != original_block_ids - - cows = allocator.clear_copy_on_writes() - - # Always expect copy-on-write - assert cows - - if sequence_len % block_size > 0: - # If the last block in the sequence is not full, then when appending we - # expect a CoW. - assert cows - - cow_block_id = sequence_len // block_size - expected_src = static_block_table.physical_block_ids[cow_block_id] - expected_dst = appender_block_table.physical_block_ids[cow_block_id] - - assert (expected_src, expected_dst) in cows - - static_block_table.free() - appender_block_table.free() - - # After free, expect all blocks to be freed. - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - -@pytest.mark.parametrize("block_size", [1, 8]) -@pytest.mark.parametrize("sequence_len", [1, 16, 129]) -@pytest.mark.parametrize("num_new_tokens", [1, 16, 129]) -@pytest.mark.parametrize("num_lookahead_slots", [1, 7, 8]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int, - num_new_tokens: int, - num_lookahead_slots: int, - allocator_type: str): - """Verify correct calculation of get_num_blocks_touched_by_append_slots. - - This is done by using copy-on-write, which requires any modified block to - be copied before write if the refcount > 1. We set the refcount>1 by forking - a sequence, then measure the free blocks before and after an append. If the - number of consumed blocks equals what `get_num_blocks_touched_by_append_ - slots` returns, then the calculation is correct. - """ - - num_gpu_blocks = 1024 - - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=0, - block_size=block_size, - ) - - token_ids = list(range(sequence_len)) - token_ids_to_append = list(range(num_new_tokens)) - - block_table = BlockTable( - block_size=block_size, - block_allocator=allocator, - ) - - block_table.allocate(token_ids=token_ids, device=Device.GPU) - - # Add lookahead before fork so both sequences have the same lookahead - # blocks. - block_table.ensure_num_empty_slots(num_empty_slots=num_lookahead_slots) - - # Fork sequence so that every block has refcount > 1. 
- _ = block_table.fork() - - # Determine how many blocks should be touched. - expected_num_touched_blocks = ( - block_table.get_num_blocks_touched_by_append_slots( - token_ids=token_ids_to_append, - num_lookahead_slots=num_lookahead_slots)) - - # Measure how many blocks are touched by measuring num_free_blocks before - # and after the append. - # - # We expect append_token_ids to CoW all mutated blocks that have refcount>1. - num_free_blocks_before_append = allocator.get_num_free_blocks(Device.GPU) - block_table.append_token_ids(token_ids_to_append, num_lookahead_slots) - num_consumed_blocks = (num_free_blocks_before_append - - allocator.get_num_free_blocks(Device.GPU)) - - # TODO(cade) ensure equality when num_lookahead_slots > 0. - # The reason we have < is because lookahead blocks are not copied eagerly; - # they are copied on first write. This will cause issues for beam search + - # speculative decoding. This is acceptable for now as it is a large effort - # to combine the two. To fix this, we can ensure single sequence ownership - # of lookahead blocks by appending empty slots to each block, which will - # trigger the CoW. - # - # Until then, we can accept that the consumed tokens are <= the expected - # tokens when appending with lookahead. - if num_lookahead_slots > 0: - assert num_consumed_blocks <= expected_num_touched_blocks - else: - assert num_consumed_blocks == expected_num_touched_blocks diff --git a/tests/core/block/test_common.py b/tests/core/block/test_common.py deleted file mode 100644 index 65400899b811..000000000000 --- a/tests/core/block/test_common.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random - -import pytest - -from vllm.core.block.common import RefCounter - - -@pytest.mark.parametrize("seed", list(range(20))) -@pytest.mark.parametrize("num_incrs", [1, 100]) -@pytest.mark.parametrize("num_blocks", [1024]) -def test_incr(seed: int, num_incrs: int, num_blocks: int): - random.seed(seed) - - all_block_indices = list(range(num_blocks)) - counter = RefCounter(all_block_indices=all_block_indices) - - block_id = random.randint(0, num_blocks - 1) - for i in range(num_incrs): - value = counter.incr(block_id) - assert value == i + 1 - - -@pytest.mark.parametrize("seed", list(range(20))) -@pytest.mark.parametrize("num_incrs", [1, 100]) -@pytest.mark.parametrize("num_blocks", [1024]) -def test_incr_decr(seed: int, num_incrs: int, num_blocks: int): - random.seed(seed) - - all_block_indices = list(range(num_blocks)) - counter = RefCounter(all_block_indices=all_block_indices) - - block_id = random.randint(0, num_blocks - 1) - for i in range(num_incrs): - value = counter.incr(block_id) - assert value == i + 1 - - for i in range(num_incrs): - value = counter.decr(block_id) - assert value == num_incrs - (i + 1) - - with pytest.raises(AssertionError): - counter.decr(block_id) diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py deleted file mode 100644 index 795eef6743fd..000000000000 --- a/tests/core/block/test_cpu_gpu_block_allocator.py +++ /dev/null @@ -1,96 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator -from vllm.utils import Device, chunk_list - - -@pytest.mark.parametrize("num_cpu_blocks", [0, 512]) -@pytest.mark.parametrize("num_gpu_blocks", 
[1024]) -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_allocate_mutable_block(num_cpu_blocks: int, num_gpu_blocks: int, - block_size: int, allocator_type: str): - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks, - block_size=block_size, - ) - - assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - cpu_blocks = [ - allocator.allocate_mutable_block(prev_block=None, device=Device.CPU) - for _ in range(num_cpu_blocks) - ] - assert allocator.get_num_free_blocks(Device.CPU) == 0 - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - gpu_blocks = [ - allocator.allocate_mutable_block(prev_block=None, device=Device.GPU) - for _ in range(num_gpu_blocks) - ] - assert allocator.get_num_free_blocks(Device.CPU) == 0 - assert allocator.get_num_free_blocks(Device.GPU) == 0 - - _ = [allocator.free(block) for block in cpu_blocks] - assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks - assert allocator.get_num_free_blocks(Device.GPU) == 0 - - _ = [allocator.free(block) for block in gpu_blocks] - assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - -@pytest.mark.parametrize("num_cpu_blocks", [0, 512]) -@pytest.mark.parametrize("num_gpu_blocks", [1024]) -@pytest.mark.parametrize("block_size", [2]) -@pytest.mark.parametrize("allocator_type", ["naive", "prefix_caching"]) -def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int, - block_size: int, allocator_type: str): - allocator = CpuGpuBlockAllocator.create( - allocator_type=allocator_type, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks, - block_size=block_size, - ) - - unique_token_ids = list( - range((num_cpu_blocks + num_gpu_blocks) * block_size)) - gpu_token_ids = list( - chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size)) - cpu_token_ids = list( - chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size)) - - assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - cpu_blocks = [ - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids, - device=Device.CPU) - for token_ids in cpu_token_ids - ] - assert allocator.get_num_free_blocks(Device.CPU) == 0 - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks - - gpu_blocks = [ - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids, - device=Device.GPU) - for token_ids in gpu_token_ids - ] - assert allocator.get_num_free_blocks(Device.CPU) == 0 - assert allocator.get_num_free_blocks(Device.GPU) == 0 - - _ = [allocator.free(block) for block in cpu_blocks] - assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks - assert allocator.get_num_free_blocks(Device.GPU) == 0 - - _ = [allocator.free(block) for block in gpu_blocks] - assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks - assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py deleted file mode 100644 index a31d1c46b37f..000000000000 --- a/tests/core/block/test_naive_block.py +++ /dev/null @@ -1,148 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Optional - -import pytest - -from vllm.core.block.interfaces import Block, BlockAllocator -from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator - - -class TestNaiveBlockAllocator: - - @staticmethod - def create_allocate_lambda(allocate_type: str, - allocator: NaiveBlockAllocator, - prev_block: Optional[Block], - token_ids: list[int]): - if allocate_type == "immutable": - allocate_block = lambda: allocator.allocate_immutable_block( - prev_block=prev_block, token_ids=token_ids) - elif allocate_type == "mutable": - allocate_block = lambda: allocator.allocate_mutable_block( - prev_block=prev_block) - else: - raise ValueError() - - return allocate_block - - @staticmethod - @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"]) - @pytest.mark.parametrize("num_blocks", [1, 1024]) - @pytest.mark.parametrize("block_size", [1, 16]) - def test_allocate_ooms(allocate_type: str, num_blocks: int, - block_size: int): - allocator = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) - allocate_block = TestNaiveBlockAllocator.create_allocate_lambda( - allocate_type, - allocator, - prev_block=None, - token_ids=list(range(block_size))) - - [allocate_block() for _ in range(num_blocks)] - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocate_block() - - @staticmethod - @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"]) - @pytest.mark.parametrize("num_blocks", [1, 1024]) - @pytest.mark.parametrize("block_size", [1, 16]) - def test_free_prevents_oom(allocate_type: str, num_blocks: int, - block_size: int): - allocator = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) - allocate_block = TestNaiveBlockAllocator.create_allocate_lambda( - allocate_type, - allocator, - prev_block=None, - token_ids=list(range(block_size))) - - blocks = [allocate_block() for _ in range(num_blocks)] - - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocate_block() - - block_to_free = blocks.pop() - - for _ in range(100): - block_id = block_to_free.block_id - allocator.free(block_to_free) - assert block_to_free.block_id is None - - new_block = allocate_block() - assert new_block.block_id == block_id - - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocate_block() - - block_to_free = new_block - - @staticmethod - @pytest.mark.parametrize("allocate_type", ["immutable", "mutable"]) - @pytest.mark.parametrize("num_blocks", [1024]) - @pytest.mark.parametrize("block_size", [16]) - def test_get_num_free_blocks(allocate_type: str, num_blocks: int, - block_size: int): - allocator = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) - allocate_block = TestNaiveBlockAllocator.create_allocate_lambda( - allocate_type, - allocator, - prev_block=None, - token_ids=list(range(block_size))) - - assert allocator.get_num_free_blocks() == num_blocks - - blocks = [allocate_block() for _ in range(num_blocks)] - - for i, block in enumerate(blocks): - assert allocator.get_num_free_blocks() == i - allocator.free(block) - - @staticmethod - @pytest.mark.parametrize("num_blocks", [4]) - @pytest.mark.parametrize("block_size", [8]) - def test_naive_block_get_num_full_blocks_touched(num_blocks, block_size): - """ Verify the allocator can correctly return the number of - full blocks touched. 
- """ - allocator_src = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) - allocator_dst = NaiveBlockAllocator(create_block=NaiveBlock, - num_blocks=num_blocks, - block_size=block_size) - - # Create a chain of cacheable blocks in the dst - allocate_block = TestNaiveBlockAllocator.create_allocate_lambda( - "immutable", - allocator_src, - prev_block=None, - token_ids=list(range(block_size))) - src_blocks = [allocate_block() for _ in range(num_blocks - 1)] - - # All blocks are cached - assert allocator_dst.get_num_full_blocks_touched( - src_blocks) == num_blocks - 1 - - # Insert one non-full block in the src - allocate_non_full_block = \ - TestNaiveBlockAllocator.create_allocate_lambda( - "mutable", allocator_src, - prev_block=src_blocks[-1],token_ids=[] - ) - src_blocks.append(allocate_non_full_block()) - src_blocks[-1].append_token_ids([0]) - - assert allocator_dst.get_num_full_blocks_touched( - src_blocks) == num_blocks - 1 - # Fill up the last source block and then invoke - # get_num_blocks_touched - src_blocks[-1].append_token_ids([0] * (block_size - 1)) - assert allocator_dst.get_num_full_blocks_touched( - src_blocks) == num_blocks diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py deleted file mode 100644 index 46e224c6f53b..000000000000 --- a/tests/core/block/test_prefix_caching_block.py +++ /dev/null @@ -1,1035 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -import random -from typing import Optional -from unittest.mock import MagicMock - -import pytest - -from tests.core.utils import create_dummy_lora_sequence, create_dummy_sequence -from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator -from vllm.core.block.interfaces import Block, BlockAllocator -from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker, - PrefixCachingBlock, - PrefixCachingBlockAllocator) -from vllm.sequence import Logprob -from vllm.utils import Device - - -class TestPrefixCachingBlock: - - @staticmethod - @pytest.mark.parametrize("seed", list(range(10))) - @pytest.mark.parametrize("block_size", [1, 16]) - @pytest.mark.parametrize("is_curr_block_full", [True, False]) - def test_first_block_has_correct_content_hash(seed: int, block_size: int, - is_curr_block_full: bool): - """Verify a block which is first in the sequence has the correct hash. - """ - random.seed(seed) - num_to_fill = block_size if is_curr_block_full else random.randint( - 0, block_size - 1) - token_ids = list(range(num_to_fill)) - mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator) - - block_with_prev = PrefixCachingBlock(prev_block=None, - token_ids=token_ids, - block_size=block_size, - allocator=mock_allocator) - - if is_curr_block_full: - # Expect hash since block is full. - assert block_with_prev.content_hash == ( - PrefixCachingBlock.hash_block_tokens( - is_first_block=True, - prev_block_hash=None, - cur_block_token_ids=token_ids)) - else: - # Do not expect hash since block is not full. 
- assert block_with_prev.content_hash is None - - @staticmethod - @pytest.mark.parametrize("seed", list(range(10))) - @pytest.mark.parametrize("block_size", [1, 16]) - @pytest.mark.parametrize("is_curr_block_full", [True, False]) - @pytest.mark.parametrize("prev_block_has_hash", [True, False]) - def test_nth_block_has_correct_content_hash(seed: int, block_size: int, - is_curr_block_full: bool, - prev_block_has_hash: bool): - """Verify a block which is not first in the sequence has the correct - hash. - """ - - random.seed(seed) - - previous_block = MagicMock(spec=PrefixCachingBlock) - prev_block_hash = random.randint(0, 1000) - previous_block.content_hash = (prev_block_hash if prev_block_has_hash - else hash('None')) - - num_to_fill = block_size if is_curr_block_full else random.randint( - 0, block_size - 1) - token_ids = list(range(num_to_fill)) - mock_allocator = MagicMock(spec=PrefixCachingBlockAllocator) - - block_with_prev = PrefixCachingBlock( - prev_block=previous_block, - token_ids=token_ids, - block_size=block_size, - allocator=mock_allocator, - ) - - if is_curr_block_full and prev_block_has_hash: - # Expect hash since block is full and previous block has hash. - assert (block_with_prev.content_hash == - PrefixCachingBlock.hash_block_tokens( - is_first_block=False, - prev_block_hash=prev_block_hash, - cur_block_token_ids=token_ids)) - else: - # Do not expect hash since block is not full or the previous block - # does not have a hash. - assert block_with_prev.content_hash is None - - @staticmethod - @pytest.mark.parametrize("block_size", [1, 2, 16]) - @pytest.mark.parametrize("num_tokens", list(range(3))) - @pytest.mark.parametrize("num_empty_trailing_blocks", [0, 1, 10]) - def test_blocks_have_correct_hash_in_chain(block_size: int, - num_tokens: int, - num_empty_trailing_blocks: int): - """Create two chains of logical blocks with the same contents. - Assert the hashes are equal. - """ - random.seed(0) - - token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)] - - first_chain, second_chain = (TestPrefixCachingBlock.create_chain( - block_size=block_size, - token_ids=token_ids, - num_empty_trailing_blocks=num_empty_trailing_blocks) - for _ in range(2)) - - for first_chain_block, second_chain_block in zip( - first_chain, second_chain): - assert (first_chain_block.content_hash == - second_chain_block.content_hash) - - if not first_chain or not second_chain: - assert first_chain == second_chain - assert num_tokens == 0 - - @staticmethod - def create_chain(block_size: int, - token_ids: list[int], - num_empty_trailing_blocks=0) -> list[PrefixCachingBlock]: - """Helper method which creates a chain of blocks. 
- """ - blocks: list[PrefixCachingBlock] = [] - num_blocks = math.ceil( - len(token_ids) / block_size) + num_empty_trailing_blocks - - if num_blocks == 0: - return [] - - allocator = MagicMock(spec=PrefixCachingBlockAllocator) - - prev_block = None - for block_number in range(0, num_blocks): - prev_block = PrefixCachingBlock( - prev_block=prev_block, - token_ids=[], - block_size=block_size, - allocator=allocator, - ) - - tokens_to_append = token_ids[block_number * - block_size:(block_number + 1) * - block_size] - if tokens_to_append: - prev_block.append_token_ids(tokens_to_append) - - blocks.append(prev_block) - - return blocks - - -class TestPrefixCachingBlockAllocator: - - @staticmethod - def create_allocate_lambda(allocate_type: str, allocator: BlockAllocator, - prev_block: Optional[Block], - token_ids: list[int]): - if allocate_type == "immutable": - allocate_block = lambda: allocator.allocate_immutable_block( - prev_block=prev_block, token_ids=token_ids) - elif allocate_type == "mutable": - allocate_block = lambda: allocator.allocate_mutable_block( - prev_block=prev_block) - else: - raise ValueError() - - return allocate_block - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1, 1024]) - @pytest.mark.parametrize("block_size", [1, 16]) - def test_allocate_mutable_ooms(num_blocks: int, block_size: int): - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda( - allocate_type="mutable", - allocator=allocator, - prev_block=None, - token_ids=list(range(block_size)), - ) - - [allocate_block() for _ in range(num_blocks)] - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocate_block() - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1, 1024]) - @pytest.mark.parametrize("block_size", [1, 16]) - def test_allocate_immutable_does_not_oom_single_hash( - num_blocks: int, block_size: int): - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - allocate_block = TestPrefixCachingBlockAllocator.create_allocate_lambda( - allocate_type="immutable", - allocator=allocator, - prev_block=None, - token_ids=list(range(block_size)), - ) - - blocks = [allocate_block() for _ in range(num_blocks)] - - # Expect no OOM. If these were mutable blocks, this would OOM. - non_oom_block = allocate_block() - - # Expect all blocks to have same physical block index. - for block in blocks: - assert (block.block_id == non_oom_block.block_id) - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1, 1024]) - @pytest.mark.parametrize("block_size", [1, 16]) - def test_allocate_immutable_ooms_many_hash(num_blocks: int, - block_size: int): - """Consume all blocks using many different hashes/block content. - - Do this by creating a sequence that is very long. - Expect next block to OOM. - """ - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - - # Create token ids that will exhaust all blocks. - token_ids = list(range(num_blocks * block_size)) - - chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # Expect allocation with unseen hash to fail. - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocator.allocate_immutable_block(prev_block=chain[-1], - token_ids=list( - range(block_size))) - - # Expect mutable allocation to fail. 
- with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocator.allocate_mutable_block(prev_block=chain[-1]) - - # Expect allocation of exact same chain to pass. - second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # Expect physical block indices to be the same in both chains. - assert chain and second_chain - for first_chain_block, second_chain_block in zip(chain, second_chain): - assert (first_chain_block.block_id == second_chain_block.block_id) - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1, 1024]) - @pytest.mark.parametrize("block_size", [1, 16]) - def test_free_prevents_oom(num_blocks: int, block_size: int): - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - - # Create token ids that will exhaust all blocks. - token_ids = list(range(num_blocks * block_size)) - - chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # Expect mutable allocation to fail. - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocator.allocate_mutable_block(prev_block=None) - - block_to_free = chain[-1] - - # Expect free/allocate loop to succeed many times. - for i in range(100): - block_id = block_to_free.block_id - allocator.free(block_to_free) - assert block_to_free.block_id is None, i - - new_block = allocator.allocate_mutable_block(prev_block=None) - assert new_block.block_id == block_id, i - - with pytest.raises(BlockAllocator.NoFreeBlocksError): - allocator.allocate_mutable_block(prev_block=None) - - block_to_free = new_block - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1024]) - @pytest.mark.parametrize("block_size", [16]) - @pytest.mark.parametrize("seed", list(range(20))) - def test_get_num_free_blocks(num_blocks: int, block_size: int, seed: int): - random.seed(seed) - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - num_blocks_to_consume = random.randint(1, num_blocks - 1) - - # Create token ids that will exhaust all blocks. - token_ids = list(range(num_blocks_to_consume * block_size)) - - chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # Free each block in chain, assert num free blocks includes new free - # block. - for i, block in enumerate(chain): - assert allocator.get_num_free_blocks() == (num_blocks - - num_blocks_to_consume + - i) - allocator.free(block) - - @staticmethod - @pytest.mark.parametrize("num_blocks", [4]) - @pytest.mark.parametrize("block_size", [8]) - def test_prefix_caching_block_get_num_full_blocks_touched( - num_blocks, block_size): - """ Verify the allocator can correctly return the number of - blocks touched, when there are cached prefixes. 
- """ - allocator_src = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - allocator_dst = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - - # Create token ids that will exhaust all blocks except the last - token_ids = list(range((num_blocks - 1) * block_size)) - - # Create a chain of cacheable blocks in the dst - cached_blocks = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator_dst, - ) - - # Create a chain of the same blocks in the src - blocks_to_swap_in = \ - TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator_src, - ) - # All blocks are cached - assert allocator_dst.get_num_full_blocks_touched( - blocks_to_swap_in) == 0 - - # Free the first block in the dst - allocator_dst.free(cached_blocks[0]) - - # Now the first block becomes dangling, the swapped blocks need - # to reclaim the first block in the dst - assert allocator_dst.get_num_full_blocks_touched( - blocks_to_swap_in) == 1 - - # Insert one non-full block in the src - non_full_block = allocator_src.allocate_mutable_block( - blocks_to_swap_in[-1]) - non_full_block.append_token_ids([0]) - blocks_to_swap_in.append(non_full_block) - assert allocator_dst.get_num_full_blocks_touched( - blocks_to_swap_in) == 1 - # Fill up the last mutable block and invoke get_num_blocks_touched. - # Note: The last block is not cached so it will be touched. - non_full_block.append_token_ids([0] * (block_size - 1)) - assert allocator_dst.get_num_full_blocks_touched( - blocks_to_swap_in) == 2 - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1024]) - @pytest.mark.parametrize("block_size", [16]) - @pytest.mark.parametrize("seed", list(range(20))) - def test_get_num_free_blocks_shared(num_blocks: int, block_size: int, - seed: int): - """Verify sharing occurs by allocating two sequences that share prefixes - and incrementally freeing blocks. - """ - random.seed(seed) - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - num_blocks_to_consume = random.randint(1, num_blocks - 1) - - # Create token ids that will exhaust all blocks. - token_ids = list(range(num_blocks_to_consume * block_size)) - - first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # Free each block in the first chain. Since all blocks are shared, the - # free count should stay constant. - for i, block in enumerate(first_chain): - assert allocator.get_num_free_blocks() == (num_blocks - - num_blocks_to_consume) - allocator.free(block) - - # Free each block in the second chain. Since the refcount is now zero, - # the free count should increment with each free. 
- for i, block in enumerate(second_chain): - assert allocator.get_num_free_blocks() == (num_blocks - - num_blocks_to_consume + - i) - allocator.free(block) - - @staticmethod - @pytest.mark.parametrize("num_blocks", [1024]) - @pytest.mark.parametrize("block_size", [16]) - @pytest.mark.parametrize("seed", list(range(20))) - def test_get_common_computed_block_ids(num_blocks: int, block_size: int, - seed: int): - """Verify get_common_computed_block_ids could get correct result - by create two immutable chain sharing prefix at specified pos, - and compare whether we also could get right result - from get_common_computed_block_ids. - """ - random.seed(seed) - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks * 2, - block_size=block_size) - num_blocks_to_consume = random.randint(1, num_blocks - 1) - - # Create token ids that will exhaust all blocks. - token_ids = list(range(num_blocks_to_consume * block_size)) - - first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # After zero_point, second_chain's token_ids would be set -1, which - # make it different from here comparing with first_chain - zero_point = random.randint(1, len(token_ids) - 1) - zero_point_blocks = zero_point // block_size - token_ids[zero_point:] = [-1] * (len(token_ids) - zero_point) - - second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - first_computed_ids = [ - first_chain[i].block_id for i in range(num_blocks_to_consume) - ] - second_computed_ids = [ - second_chain[i].block_id for i in range(num_blocks_to_consume) - ] - res = allocator.get_common_computed_block_ids( - [first_computed_ids, second_computed_ids]) - - assert (len(res) == zero_point_blocks) - - # Test case that assume those prompted block after first immutable would - # be freed into hashless allocator, while first immutable block get ref - # increased. 
- @staticmethod - @pytest.mark.parametrize("num_blocks", [3]) - @pytest.mark.parametrize("block_size", [16]) - @pytest.mark.parametrize("seed", list(range(10))) - def test_alloc_promotion(num_blocks: int, block_size: int, seed: int): - random.seed(seed) - - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - token_ids = list(range(block_size)) - - block = allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids) - - assert allocator._refcounter.get(block.block_id) == 1 - m = allocator.allocate_mutable_block(prev_block=None) - - block_id = m.block_id - for i in range(block_size): - m.append_token_ids([i]) - - # After block get promoted to immutable from mutable, if there is - # already same content hash block, then it shall be released into - # hashless_allocator - # And first immutable block's ref get increased by 1 - assert m.block_id == block.block_id - assert block_id in allocator._hashless_allocator._free_block_indices - assert allocator._refcounter.get(block.block_id) == 2 - - # Test case when eviction and allocation are mixed, - # make sure they work as expected - @staticmethod - @pytest.mark.parametrize("num_blocks", [3]) - @pytest.mark.parametrize("block_size", [16]) - @pytest.mark.parametrize("seed", list(range(10))) - def test_eviction_alloc_mixed(num_blocks: int, block_size: int, seed: int): - random.seed(seed) - - all_blocks_list = [i for i in range(num_blocks)] - zero_ref = {i: 0 for i in range(num_blocks)} - one_ref = {i: 1 for i in range(num_blocks)} - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - token_ids = list(range(num_blocks * block_size)) - - # Verify initial/pre-alloc state - - # Ensure all blocks are free inside hashless allocator - assert list(allocator._hashless_allocator._free_block_indices - ) == all_blocks_list - # Ensure no tracked blocks - assert len(allocator._block_tracker.keys()) == num_blocks - for block_id in range(num_blocks): - assert not allocator._block_tracker[block_id].active - # Ensure no cached blocks - assert len(allocator._cached_blocks.values()) == 0 - # Ensure no evicted blocks - assert len(allocator.evictor.free_table.keys()) == 0 - # Ensure 0s ref counts for all blocks - assert allocator._refcounter._refcounts == zero_ref - - # Allocate immutable chains with only one block residuled in - new_block = [] - for i in range(num_blocks): - block = allocator.allocate_immutable_block( - prev_block=None, - token_ids=token_ids[block_size * i:block_size * (i + 1)]) - new_block.append(block) - - # Verify post-alloc state - - # Ensure no blocks are free inside hashless allocator - assert (len(allocator._hashless_allocator._free_block_indices) == 0) - # Ensure all blocks are tracked - assert len(allocator._block_tracker.keys()) == num_blocks - for block_id in range(num_blocks): - assert allocator._block_tracker[block_id].active - # Ensure all blocks are cached (all promoted) - assert len(allocator._cached_blocks.values()) == num_blocks - # Ensure no evicted blocks - assert len(allocator.evictor.free_table.keys()) == 0 - # Ensure 1s ref counts for all blocks - assert allocator._refcounter._refcounts == one_ref - - # Free all blocks, and now all blocks shall be in the evictor - # there shall be no tracking data left in _block_tracker - # all blocks shall be tracked in _cached_blocks - # all blocks' ref shall be zero - for block in new_block: - allocator.free(block) - - # Verify post-free state - - # Ensure no tracked blocks - assert 
len(allocator._block_tracker.keys()) == num_blocks - for block_id in range(num_blocks): - assert not allocator._block_tracker[block_id].active - # Ensure no blocks in hashless allocator (all promoted) - assert len(allocator._hashless_allocator._free_block_indices) == 0 - # Ensure all blocks are cached - assert list(allocator._cached_blocks.values()) == all_blocks_list - # Ensure all blocks are inside the evictor - assert list(allocator.evictor.free_table.keys()) == all_blocks_list - # Ensure 0s refcounts - assert allocator._refcounter._refcounts == zero_ref - - # Allocate a mutable block, and the first block shall be evicted - # and set its content hash into None, ref to 1 - mutable = allocator.allocate_mutable_block(prev_block=None) - - assert mutable.block_id == 0 - assert mutable.content_hash is None - assert allocator._block_tracker[0].active - assert allocator._refcounter.get(0) == 1 - assert 0 not in allocator._cached_blocks - assert 0 not in allocator.evictor - - # Since this mutable block has no hash yet, it shall be released into - # hashless allocator - allocator.free(mutable) - - assert not allocator._block_tracker[0].active - assert allocator._refcounter._refcounts == zero_ref - assert 0 not in allocator._cached_blocks - assert 0 not in allocator.evictor - assert 0 in allocator._hashless_allocator._free_block_indices - - # When allocate immutable with first block_size tokens, we - # shall get free block from hashless allocator, thus no block left - # in hashless - block = allocator.allocate_immutable_block( - prev_block=None, token_ids=token_ids[:block_size]) - - assert block.block_id == 0 - assert len(allocator._hashless_allocator._free_block_indices) == 0 - assert allocator._block_tracker[0].active - assert 0 in allocator._cached_blocks.values() - assert allocator._refcounter.get(0) == 1 - assert 0 not in allocator.evictor - - # allocate mutable block again, it shall be popped from evictor - mutable = allocator.allocate_mutable_block(prev_block=None) - assert len(allocator._hashless_allocator._free_block_indices) == 0 - assert mutable.block_id not in allocator.evictor.free_table - assert allocator._refcounter.get(mutable.block_id) == 1 - - # Test case where two last accessed times are equal - @staticmethod - @pytest.mark.parametrize("num_blocks", [1024]) - @pytest.mark.parametrize("block_size", [16]) - @pytest.mark.parametrize("seed", list(range(20))) - def test_eviction_order(num_blocks: int, block_size: int, seed: int): - """This test case simulate the two chain created and free in order, - and together they would exhaust the initial freed blocks. - - So the next block created after those two chain shall use the block - from the first chain as that block has long access time. - While first chain has two blocks, it shall pick up the last one, as - it has larger token number. 
- """ - - random.seed(seed) - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - num_blocks_to_consume = num_blocks + 1 - - token_ids = list(range(num_blocks_to_consume * block_size)) - - num_blocks_in_first_chain = 2 - num_tokens_in_first_chain = block_size * num_blocks_in_first_chain - # First chain takes the first block - first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids[:num_tokens_in_first_chain], - allocator=allocator, - ) - # There should only be one block allocated at this point - assert allocator.get_num_free_blocks() == (num_blocks - - num_blocks_in_first_chain) - - # Set the last accessed time of the first block to 1 - blocks_ids = [block.block_id for block in first_chain] - allocator.mark_blocks_as_accessed(blocks_ids, 1) - - # Second chain takes the rest of the blocks - second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids[num_tokens_in_first_chain:-block_size], - allocator=allocator, - ) - - # There shouldn't be any blocks left at this point - assert allocator.get_num_free_blocks() == (0) - - assert len(first_chain) == num_blocks_in_first_chain - last_block_id = first_chain[-1].block_id - # Free each block in the first chain. - for i, block in enumerate(first_chain): - allocator.free(block) - - # Set the last accessed time on all of the blocks in the second chain - # to 2 - blocks_ids = [block.block_id for block in second_chain] - allocator.mark_blocks_as_accessed(blocks_ids, 2) - - # Free each block in the second chain. - for i, block in enumerate(second_chain): - allocator.free(block) - - # Allocate a new block and check that it's the least recently used block - # from the first chain. - new_block = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids[-block_size:], - allocator=allocator, - ) - - assert new_block[0].block_id == last_block_id - - # Test case for cache mertics - @staticmethod - def test_metric(): - block_size = 16 - allocator = PrefixCachingBlockAllocator(num_blocks=4, - block_size=block_size) - # Test when no query (0/0) - assert allocator.get_prefix_cache_hit_rate() == 0.0 - - token_ids = list(range(block_size)) - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids) - # Test 0/1 hit rate - assert allocator.get_prefix_cache_hit_rate() == 0.0 - - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids) - # Test 1/2 hit rate - assert allocator.get_prefix_cache_hit_rate() == 0.5 - - # Test more than one block - for _ in range(2, 1005): - allocator.allocate_immutable_block(prev_block=None, - token_ids=token_ids) - assert allocator.get_prefix_cache_hit_rate() > 0.99 - - # Test case for marking cache hit blocks as computed right after - # a batch of prefill sequences are scheduled. - @staticmethod - def test_touch_block(): - block_size = 16 - common_blocks = 4 - allocator = PrefixCachingBlockAllocator(num_blocks=8, - block_size=block_size) - - common_token_ids = list(range(block_size * common_blocks)) - - # Mimic the behavior of allocating the same block chain - # (i.e., common prefix) for a batch of 3 different prefill sequences. 
- for _ in range(3): - blocks = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=common_token_ids, - allocator=allocator, - ) - block_hashes = [block.content_hash for block in blocks] - # The allocated blocks should be marked as touched - # but not computed. - computed_block_ids = allocator.find_cached_blocks_prefix( - block_hashes) - assert len(computed_block_ids) == 0 - - allocator.mark_blocks_as_computed([]) - computed_block_ids = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes) - assert len(computed_block_ids) == common_blocks - - @staticmethod - def test_find_cached_blocks_prefix(): - """ - This test verifies the behavior of find_cached_blocks_prefix. - """ - block_size = 4 - num_blocks = 8 - total_test_blocks = 12 - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - - token_ids = list(range(total_test_blocks * block_size)) - block_tokens_seq1 = token_ids[:num_blocks * block_size] - blocks_seq1 = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=block_tokens_seq1, - allocator=allocator, - ) - block_hashes_seq1 = [block.content_hash for block in blocks_seq1] - allocator.mark_blocks_as_computed([]) - - # All blocks should be cached. - cached_blocks_seq1 = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes_seq1) - assert len(cached_blocks_seq1) == num_blocks - - # Free the first sequence. - for block in blocks_seq1: - allocator.free(block) - - # All blocks should be still be cached if not required to be allocated. - cached_blocks = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes_seq1) - assert len(cached_blocks) == num_blocks - - block_tokens_seq2 = token_ids[num_blocks * block_size:] - blocks_seq2 = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=block_tokens_seq2, - allocator=allocator, - ) - block_hashes_seq2 = [block.content_hash for block in blocks_seq2] - allocator.mark_blocks_as_computed([]) - cached_blocks = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes_seq2) - assert len(cached_blocks) == len(blocks_seq2) - - # Half of the blocks from seq1 should still be cached. - num_evicted_blocks = len(blocks_seq2) - cached_blocks = allocator.find_cached_blocks_prefix( - block_hashes=block_hashes_seq1) - assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks - - # Test reset prefix cache - @staticmethod - @pytest.mark.parametrize("num_blocks", [10]) - @pytest.mark.parametrize("block_size", [16]) - def test_reset_prefix_cache(num_blocks: int, block_size: int): - """This test case simulates the case of resetting the prefix cache.""" - - allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, - block_size=block_size) - token_ids = list(range(3 * block_size)) - - first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=token_ids, - allocator=allocator, - ) - - # Free each block in the first chain. - for block in first_chain: - allocator.free(block) - - # Failed to reset prefix cache because some blocks are not freed yet. - assert not allocator.reset_prefix_cache() - assert allocator.get_prefix_cache_hit_rate() > 0.0 - - # Free each block in the second chain. - for block in second_chain: - allocator.free(block) - - # Reset prefix cache. 
- assert allocator.reset_prefix_cache() - assert allocator.get_prefix_cache_hit_rate() == 0.0 - - @staticmethod - def create_immutable_chain( - block_size: int, - token_ids: list[int], - allocator: PrefixCachingBlockAllocator, - extra_hash: Optional[int] = None, - ) -> list[PrefixCachingBlock]: - """Helper method which creates a chain of blocks. - """ - blocks: list[Block] = [] - num_blocks = math.ceil(len(token_ids) / block_size) - - if num_blocks == 0: - return [] - - prev_block = None - for block_number in range(0, num_blocks): - block_token_ids = token_ids[block_number * - block_size:(block_number + 1) * - block_size] - prev_block = allocator.allocate_immutable_block( - prev_block=prev_block, - token_ids=block_token_ids, - extra_hash=extra_hash) - blocks.append(prev_block) - - return blocks - - -class TestComputedBlocksTracker: - - @staticmethod - def _get_mock_allocator(): - return MagicMock(spec=PrefixCachingBlockAllocator) - - @staticmethod - def test_get_num_cached_tokens(): - """ - Test it correctly computes the number of cached tokens for a given - sequence: - - - The cache token count is derived from the number of cached blocks. - - The cache token count is updated when the allocator is updated. - - When a sequence is removed, the cache token count should be updated - accordingly. - - # TODO(rickyx): This behaviour for prefill sequence is a hack until - we fix the computed blocks tracking. - - The cache token count for prefill sequence doesn't change while - the sequence is in continuous prefill (chunked prefill). - """ - block_size = 4 - mock_allocator = TestComputedBlocksTracker._get_mock_allocator() - tracker = ComputedBlocksTracker( - allocator=mock_allocator, - block_size=block_size, - enable_caching=True, - ) - - # Not yet allocated. - tokens = [0, 1, 2, 3, 4, 5] - seq1 = create_dummy_sequence(request_id=0, - token_ids=tokens, - block_size=block_size) - mock_allocator.find_cached_blocks_prefix.return_value = [] - assert tracker.get_num_cached_tokens(seq1) == 0 - - mock_allocator.find_cached_blocks_prefix.return_value = [ - None - ] # 1 block cached. - # Result is cached for prefill sequence. - assert tracker.get_num_cached_tokens(seq1) == 0 - - # Mark the sequence as non-prefill. - seq1.data.update_num_computed_tokens(len(tokens)) # 6 tokens computed. - assert not seq1.is_prefill() - - # Recomputes for decoding sequence. - assert tracker.get_num_cached_tokens(seq1) == 4 - - # Append new tokens to the sequence. - num_new_tokens = 3 - for i in range(num_new_tokens): - seq1.append_token_id(i, {i: Logprob(logprob=0.0)}) - - assert tracker.get_num_cached_tokens(seq1) == 4 - - # Update the allocator. - mock_allocator.find_cached_blocks_prefix.return_value = [ - None - ] * 2 # 2 blocks cached. - assert tracker.get_num_cached_tokens(seq1) == 8 - - # Remove the sequence. - tracker.remove_seq(seq1.seq_id) - - # Re-create the sequence with the same request id to simulate recompute. - seq1 = create_dummy_sequence(request_id=0, - token_ids=tokens, - block_size=block_size) - mock_allocator.find_cached_blocks_prefix.return_value = [ - ] # no cached block - assert tracker.get_num_cached_tokens(seq1) == 0 - - @staticmethod - def test_correct_block_hash(): - """ - Test that the block hash is correctly computed for a sequence (should - match the underlying block allocator's block hash). So the number of - cached tokens is correctly retrieved. 
- """ - block_size = 4 - allocator = CpuGpuBlockAllocator.create( - allocator_type="prefix_caching", - num_gpu_blocks=16, - num_cpu_blocks=16, - block_size=block_size, - ) - gpu_allocator = allocator._allocators[Device.GPU] - - tracker = ComputedBlocksTracker( - allocator=allocator, - block_size=block_size, - enable_caching=True, - ) - - tokens = list(range(block_size * 4)) # 4 blocks. - seq = create_dummy_sequence(request_id=0, - token_ids=tokens, - block_size=block_size) - _ = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=tokens, - allocator=gpu_allocator, - ) - allocator.mark_blocks_as_computed([]) - - assert tracker.get_num_cached_tokens(seq) == len(tokens) - - @staticmethod - def test_correct_extra_hash(): - """ - Test that the block hash is correctly computed based on the extra hash, - ensuring it matches the allocator's block hash, specifically for the - LoRA case, and that the correct number of cached tokens is retrieved. - """ - block_size = 4 - allocator = CpuGpuBlockAllocator.create( - allocator_type="prefix_caching", - num_gpu_blocks=16, - num_cpu_blocks=16, - block_size=block_size, - ) - gpu_allocator = allocator._allocators[Device.GPU] - - tracker = ComputedBlocksTracker( - allocator=allocator, - block_size=block_size, - enable_caching=True, - ) - - tokens = list(range(block_size * 4)) - - # Create a dummy LoRA sequence with a specific LoRA ID. - lora_seq = create_dummy_lora_sequence(request_id=0, - token_ids=tokens, - block_size=block_size, - lora_int_id=1) - - _ = TestPrefixCachingBlockAllocator.create_immutable_chain( - block_size=block_size, - token_ids=tokens, - allocator=gpu_allocator, - extra_hash=lora_seq.extra_hash(), - ) - - allocator.mark_blocks_as_computed([]) - - # Create different dummy sequences that have the same token IDs - # but different LoRA IDs. - seq = create_dummy_sequence(request_id=1, - token_ids=tokens, - block_size=block_size) - - different_lora_seq = create_dummy_lora_sequence(request_id=2, - token_ids=tokens, - block_size=block_size, - lora_int_id=2) - - # Due to the different LoRA IDs, corresponding blocks are not cached. - assert tracker.get_num_cached_tokens(seq) == 0 - assert tracker.get_num_cached_tokens(different_lora_seq) == 0 - - # The number of cached tokens matches the length of the tokens - # for the cached LoRA sequence. - assert tracker.get_num_cached_tokens(lora_seq) == len(tokens) diff --git a/tests/core/conftest.py b/tests/core/conftest.py deleted file mode 100644 index 375b248ebeda..000000000000 --- a/tests/core/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py deleted file mode 100644 index d4dacc4f1296..000000000000 --- a/tests/core/test_chunked_prefill_scheduler.py +++ /dev/null @@ -1,862 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from unittest.mock import MagicMock - -import pytest # noqa - -from vllm.config import CacheConfig, SchedulerConfig -from vllm.core.scheduler import Scheduler -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine -from vllm.sampling_params import SamplingParams -from vllm.sequence import Logprob, SequenceGroup - -from .utils import create_dummy_prompt - - -def get_sequence_groups(scheduler_output): - return [s.seq_group for s in scheduler_output.scheduled_seq_groups] - - -def append_new_token(seq_group: SequenceGroup, token_id: int): - for seq in seq_group.get_seqs(): - seq.append_token_id(token_id, {token_id: Logprob(token_id)}) - - -def schedule_and_update_computed_tokens(scheduler): - metas, out, _ = scheduler.schedule() - for s, meta in zip(out.scheduled_seq_groups, metas): - s.seq_group.update_num_computed_tokens(meta.token_chunk_size) - return metas, out - - -def test_simple(): - """Verify basic scheduling works.""" - block_size = 4 - num_seq_group = 4 - max_model_len = 16 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig("generate", - max_num_batched_tokens, - num_seq_group, - max_model_len, - enable_chunked_prefill=True) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 8 - cache_config.num_gpu_blocks = 8 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. - for i in range(num_seq_group): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=block_size, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - # Schedule seq groups prompts. - num_tokens = block_size * num_seq_group - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - assert out.num_batched_tokens == num_tokens - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - assert len(seq_group_meta) == num_seq_group - for s in running: - append_new_token(s, 1) - - # Schedule seq groups generation. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - assert out.num_batched_tokens == num_seq_group - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - assert len(seq_group_meta) == num_seq_group - - -def test_chunk(): - """Verify prefills are chunked properly.""" - block_size = 4 - max_seqs = 60 - max_model_len = 80 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 32 - cache_config.num_gpu_blocks = 32 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. 
- for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - # Verify the second request is chunked. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - print() - assert set(get_sequence_groups(out)) == set(running) - assert seq_group_meta[0].token_chunk_size == 60 - # Verify it is chunked. - assert seq_group_meta[1].token_chunk_size == 4 - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 64 - # Only the first seq group has a new token appended. - append_new_token(running[0], 1) - - # One chunked prefill, and one decoding. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - # The first one is prefill. Scheduler guarantees ordering. - assert seq_group_meta[0].token_chunk_size == 56 - # The second one is a chunked prefill. - assert seq_group_meta[1].token_chunk_size == 1 - assert out.num_prefill_groups == 1 - assert out.num_batched_tokens == 57 - - -def test_concurrent_chunking(): - """Verify prefills are chunked properly when - --max-num-partial-prefills is > 1""" - block_size = 4 - max_seqs = 60 - max_model_len = 2000 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - max_num_partial_prefills=2, # Up to 2 partial prefills at a time - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 32 - cache_config.num_gpu_blocks = 32 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - # Verify both requests are chunked with half of max_num_batched_tokens each - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - assert seq_group_meta[0].token_chunk_size == 32 - assert seq_group_meta[1].token_chunk_size == 32 - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 64 - - # After one iteration, both should have 60 - 32 = 28 tokens left to prefill - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - assert seq_group_meta[0].token_chunk_size == 28 - assert seq_group_meta[1].token_chunk_size == 28 - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 56 - - -def test_concurrent_chunking_large_requests(): - """Verify large prefill requests are run one at a time""" - block_size = 4 - max_seqs = 60 - max_model_len = 2000 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - max_num_partial_prefills=2, # Up to 2 partial prefills at a time - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 3200 # large KV cache size for large requests - cache_config.num_gpu_blocks = 3200 - scheduler = Scheduler(scheduler_config, cache_config, None) - - # Add seq groups to scheduler. 
- for i in range(2): - _, seq_group = create_dummy_prompt( - str(i), - prompt_length=1200, # Very large prompt - block_size=block_size) - scheduler.add_seq_group(seq_group) - - # Verify only a single request is chunked, and it gets all 64 tokens - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert len(get_sequence_groups(out)) == 1 - assert seq_group_meta[0].token_chunk_size == 64 - assert out.num_prefill_groups == 1 - assert out.num_batched_tokens == 64 - - -def test_short_prompts_jump_long_prompts_in_queue(): - """Verify large prefill requests are punted behind smaller ones if - another large prefill request is already running""" - block_size = 4 - max_seqs = 60 - max_model_len = 2000 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - max_num_partial_prefills=2, # Up to 2 partial prefills at a time - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 3200 # large KV cache size for large requests - cache_config.num_gpu_blocks = 3200 - scheduler = Scheduler(scheduler_config, cache_config, None) - long_seqs: list[SequenceGroup] = [] - short_seqs: list[SequenceGroup] = [] - - # Add 2 large seq groups to scheduler. - for i in range(2): - _, seq_group = create_dummy_prompt( - str(i), - prompt_length=1200, # Very large prompt - block_size=block_size) - scheduler.add_seq_group(seq_group) - long_seqs.append(seq_group) - assert seq_group.is_prefill() - - # Add 2 small seq groups behind them - for i in range(2): - _, seq_group = create_dummy_prompt( - str(i + 2), - prompt_length=40, # Very small prompt - block_size=block_size) - scheduler.add_seq_group(seq_group) - short_seqs.append(seq_group) - assert seq_group.is_prefill() - - # Verify one large req and 1 small req chunked - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert seq_group_meta[0].token_chunk_size == 32 # large req gets 32 tokens - assert seq_group_meta[1].token_chunk_size == 32 # small req gets 32 tokens - - # all 4 are prefilling - assert long_seqs[0].is_prefill() - assert long_seqs[1].is_prefill() - assert short_seqs[0].is_prefill() - assert short_seqs[1].is_prefill() - # First short and first long sequences have been scheduled - assert long_seqs[0].first_seq.get_num_computed_tokens() == 32 - assert long_seqs[1].first_seq.get_num_computed_tokens() == 0 - assert short_seqs[0].first_seq.get_num_computed_tokens() == 32 - assert short_seqs[1].first_seq.get_num_computed_tokens() == 0 - - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 64 - - # in the second iteration, - # the first small request had only 8 tokens left - # so it went to decode - # The other small req is scheduled - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - # the new small req got 64 - (32+8) tokens - assert seq_group_meta[0].token_chunk_size == 24 - assert seq_group_meta[1].token_chunk_size == 32 # large req still got 32 - # the other small request had only 8 tokens left - assert seq_group_meta[2].token_chunk_size == 8 # 40-32 - - # The first small request got to decode now - assert long_seqs[0].is_prefill() - assert long_seqs[1].is_prefill() - assert not short_seqs[0].is_prefill() - assert short_seqs[1].is_prefill() - # Both small requests have started in front of the second long request - assert long_seqs[0].first_seq.get_num_computed_tokens() == 64 - assert long_seqs[1].first_seq.get_num_computed_tokens() == 
0 - assert short_seqs[0].first_seq.get_num_computed_tokens() == 40 - assert short_seqs[1].first_seq.get_num_computed_tokens() == 24 - - assert out.num_prefill_groups == 3 - assert out.num_batched_tokens == 64 - # the first small seq group has a new token appended. - append_new_token(short_seqs[0], 1) - - # in the third iteration, - # the first small request is already decoding - # the second small request only has 16 tokens left and will enter decoding - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert seq_group_meta[0].token_chunk_size == 32 # large still got 32 - # small req finished prefilling 40-24=16 tokens - assert seq_group_meta[1].token_chunk_size == 16 - assert seq_group_meta[2].token_chunk_size == 1 # decode - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 49 # (32+16+1 decode) - - # both small requests have now reached decode - assert long_seqs[0].is_prefill() - assert long_seqs[1].is_prefill() - assert not short_seqs[0].is_prefill() - assert not short_seqs[1].is_prefill() - assert long_seqs[0].first_seq.get_num_computed_tokens() == 96 - assert long_seqs[1].first_seq.get_num_computed_tokens() == 0 - assert short_seqs[0].first_seq.get_num_computed_tokens() == 41 - assert short_seqs[1].first_seq.get_num_computed_tokens() == 40 - - # both the small seq groups have a new token appended - append_new_token(short_seqs[0], 1) - append_new_token(short_seqs[1], 1) - - # in the fourth iteration, both small requests are decoding - # so large request gets all the budget - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - - # large req gets 62 tokens (minus 2 for decode) - assert seq_group_meta[0].token_chunk_size == 62 - assert seq_group_meta[1].token_chunk_size == 1 # decode - assert seq_group_meta[2].token_chunk_size == 1 # decode - assert out.num_prefill_groups == 1 - assert out.num_batched_tokens == 64 - - assert long_seqs[0].first_seq.get_num_computed_tokens() == 158 - - # assert long_seqs[0].is_prefill() - # assert long_seqs[1].is_prefill() - # assert not short_seqs[0].is_prefill() - # assert not short_seqs[1].is_prefill() - - # # both the small seq groups have a new token appended - # append_new_token(short_seqs[0], 1) - # append_new_token(short_seqs[1], 1) - - # # in the fifth iteration, large request gets all the budget - # # while both small requests are decoding - # seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - # assert seq_group_meta[0].token_chunk_size == 62 - # assert seq_group_meta[1].token_chunk_size == 1 # decode - # assert seq_group_meta[2].token_chunk_size == 1 # decode - # assert out.num_prefill_groups == 1 - # assert out.num_batched_tokens == 64 - - -def test_complex(): - block_size = 4 - max_seqs = 60 - max_model_len = 80 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 64 - cache_config.num_gpu_blocks = 64 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - assert seq_group.is_prefill() - - # Verify the second request is chunked. 
- seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - - assert set(get_sequence_groups(out)) == set(running) - assert seq_group_meta[0].token_chunk_size == 60 - # Verify it is chunked. - assert seq_group_meta[1].token_chunk_size == 4 - assert not running[0].is_prefill() - assert running[1].is_prefill() - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 64 - # Only the first seq group has a new token appended. - append_new_token(running[0], 1) - - # Add 2 more requests. - for i in range(2, 4): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - # Decoding & chunked prefill & first chunk of 3rd request is scheduled. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert len(get_sequence_groups(out)) == 3 - # The first one is the first chunked prefill. - assert seq_group_meta[0].token_chunk_size == 7 - # The second one is the second new chunked prefill. - assert seq_group_meta[1].token_chunk_size == 56 - # The last one is decode. - assert seq_group_meta[2].token_chunk_size == 1 - # Two of them are in chunked prefill. - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 64 - # The first 2 requests are now in decodine phase. - append_new_token(running[0], 1) - assert not running[0].is_prefill() - append_new_token(running[1], 1) - assert not running[1].is_prefill() - # The third request is still in prefill stage. - assert running[2].is_prefill() - - -def test_maximal_decoding(): - """Verify decoding requests are prioritized.""" - block_size = 4 - max_seqs = 2 - max_model_len = 8 - max_num_batched_tokens = 2 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 8 - cache_config.num_gpu_blocks = 8 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=2, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - assert seq_group.is_prefill() - - # The first prefill is scheduled. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert len(get_sequence_groups(out)) == 1 - assert seq_group_meta[0].token_chunk_size == 2 - assert not running[0].is_prefill() - assert running[1].is_prefill() - assert out.num_prefill_groups == 1 - assert out.num_batched_tokens == 2 - # Only the first seq group has a new token appended. - append_new_token(running[0], 1) - - # Create one more seq_group. - _, seq_group = create_dummy_prompt("3", - prompt_length=2, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - assert seq_group.is_prefill() - # The first decoding + second chunk is scheduled. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert len(get_sequence_groups(out)) == 2 - assert seq_group_meta[0].token_chunk_size == 1 - assert seq_group_meta[1].token_chunk_size == 1 - assert not running[0].is_prefill() - assert running[1].is_prefill() - assert running[2].is_prefill() - assert out.num_prefill_groups == 1 - assert out.num_batched_tokens == 2 - append_new_token(running[0], 1) - - # Decoding + running prefill is prioritized. 
- seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert len(get_sequence_groups(out)) == 2 - assert seq_group_meta[0].token_chunk_size == 1 - assert seq_group_meta[1].token_chunk_size == 1 - assert not running[0].is_prefill() - assert not running[1].is_prefill() - assert out.num_prefill_groups == 1 - assert out.num_batched_tokens == 2 - append_new_token(running[0], 1) - append_new_token(running[1], 1) - - # Only decoding is prioritized. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert len(get_sequence_groups(out)) == 2 - assert seq_group_meta[0].token_chunk_size == 1 - assert seq_group_meta[1].token_chunk_size == 1 - assert not running[0].is_prefill() - assert not running[1].is_prefill() - assert out.num_prefill_groups == 0 - assert out.num_batched_tokens == 2 - append_new_token(running[0], 1) - append_new_token(running[1], 1) - - # After aborting the decoding request, the fcfs new prefill is prioritized. - scheduler.abort_seq_group(running[0].request_id) - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert len(get_sequence_groups(out)) == 2 - assert seq_group_meta[0].token_chunk_size == 1 - assert seq_group_meta[1].token_chunk_size == 1 - assert not running[1].is_prefill() - assert running[2].is_prefill() - assert out.num_prefill_groups == 1 - assert out.num_batched_tokens == 2 - - -def test_prompt_limit(): - """Verify max_num_batched_tokens < max_model_len is possible.""" - block_size = 4 - max_seqs = 32 - max_model_len = 64 - max_num_batched_tokens = 32 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 16 - cache_config.num_gpu_blocks = 16 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - _, seq_group = create_dummy_prompt("1", - prompt_length=48, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - assert seq_group.is_prefill() - - # The prompt length > max_num_batched_tokens should be still scheduled. 
- seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert len(get_sequence_groups(out)) == 1 - assert seq_group_meta[0].token_chunk_size == 32 - assert running[0].is_prefill() - assert out.num_prefill_groups == 1 - assert out.num_batched_tokens == 32 - - -def test_prompt_limit_exceed(): - block_size = 4 - max_seqs = 64 - max_model_len = 32 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig("generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 16 - cache_config.num_gpu_blocks = 16 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - _, seq_group = create_dummy_prompt("2", - prompt_length=48, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - assert seq_group.is_prefill() - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.ignored_seq_groups) == 1 - assert out.ignored_seq_groups[0] == seq_group - - -def test_chunked_prefill_preempt(): - """Verify preempt works with chunked prefill requests""" - block_size = 4 - max_seqs = 30 - max_model_len = 200 - max_num_batched_tokens = 30 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 16 - cache_config.num_gpu_blocks = 16 - scheduler = Scheduler(scheduler_config, cache_config, None) - - _, seq_group = create_dummy_prompt("1", - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - _, out = schedule_and_update_computed_tokens(scheduler) - # The request is chunked. - # prefill scheduled now. - assert len(out.scheduled_seq_groups) == 1 - assert out.num_prefill_groups == 1 - assert seq_group.is_prefill() - assert out.num_batched_tokens == max_num_batched_tokens - - # The request should be preempted. - scheduler.block_manager.can_append_slots = MagicMock() - - def cannot_append_second_group1(seq_group, num_lookahead_slots): - return seq_group.request_id != "1" - - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group1) - - # The running prefill is now preempted. - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 0 - assert out.num_batched_tokens == 0 - assert out.blocks_to_swap_out == [] - assert out.blocks_to_swap_in == [] - - # Make sure we can reschedule preempted request. - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - assert out.num_prefill_groups == 1 - assert seq_group.is_prefill() - assert out.num_batched_tokens == max_num_batched_tokens - assert seq_group.get_num_uncomputed_tokens() == 30 - - # We should be able to run prefill twice as it is chunked. 
- def cannot_append_second_group2(seq_group, num_lookahead_slots): - return True - - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group2) - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - assert out.num_prefill_groups == 1 - assert not seq_group.is_prefill() - assert out.num_batched_tokens == max_num_batched_tokens - - -@pytest.mark.parametrize("num_scheduler_steps", [1, 5]) -def test_chunked_prefill_spec_prefill(num_scheduler_steps): - """Verify that the num_lookahead_slots is set appropriately for an all""" - """prefill batch depending on whether multi-step scheduling is enabled""" - """or not""" - block_size = 4 - max_seqs = 30 - max_model_len = 200 - max_num_batched_tokens = 30 - num_lookahead_slots = 4 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - num_lookahead_slots=num_lookahead_slots, - num_scheduler_steps=num_scheduler_steps, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 16 - cache_config.num_gpu_blocks = 16 - scheduler = Scheduler(scheduler_config, cache_config, None) - - _, seq_group = create_dummy_prompt("1", - prompt_length=30, - block_size=block_size) - scheduler.add_seq_group(seq_group) - _, out = schedule_and_update_computed_tokens(scheduler) - # The request is chunked. - # prefill scheduled now. - assert len(out.scheduled_seq_groups) == 1 - assert out.num_prefill_groups == 1 - assert out.num_batched_tokens == max_num_batched_tokens - print(out.num_lookahead_slots) - assert out.num_lookahead_slots == (0 if (num_scheduler_steps == 1) else - num_lookahead_slots) - - -def test_chunked_prefill_max_seqs(): - block_size = 4 - max_seqs = 2 - max_model_len = 80 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 128 - cache_config.num_gpu_blocks = 128 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - _, seq_group = create_dummy_prompt("1", - prompt_length=65, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - # The first prefill is chunked. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert seq_group_meta[0].token_chunk_size == max_num_batched_tokens - assert len(get_sequence_groups(out)) == 1 - - # Add new requests. - for i in range(4): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=65, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - # Make sure only 2 requests are scheduled. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert out.num_batched_tokens == max_num_batched_tokens - assert len(get_sequence_groups(out)) == 2 - assert not running[0].is_prefill() - assert running[1].is_prefill() - append_new_token(running[0], 1) - - # Although we have enough token budget, we can only schedule max_seqs. 
- seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert seq_group_meta[0].token_chunk_size == 2 - assert seq_group_meta[1].token_chunk_size == 1 - assert out.num_batched_tokens == 3 - assert len(get_sequence_groups(out)) == max_seqs - assert not running[0].is_prefill() - assert not running[1].is_prefill() - - -def test_prefix_caching(): - """Verify allocating full blocks when prefix caching is enabled.""" - block_size = 4 - max_seqs = 10 - max_model_len = 80 - max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - ) - cache_config = CacheConfig(block_size, - 1.0, - 1, - "auto", - enable_prefix_caching=True) - cache_config.num_cpu_blocks = 0 - cache_config.num_gpu_blocks = 32 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - block_size=block_size, - prompt_length=50) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - assert seq_group_meta[0].token_chunk_size == 50 - # Verify it is chunked. Note that although the budget is 64-50=14, - # we only allocate full blocks for prefix caching, so only 4*(14//4)=12 - # tokens are allocated. - assert seq_group_meta[1].token_chunk_size == 12 - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 62 - - -def test_prefix_caching_with_concurrent_partial_prefills(): - """Verify allocating full blocks when prefix caching is enabled with - --max-num-partial-prefills > 1.""" - block_size = 4 - max_seqs = 10 - max_model_len = 8000 - max_num_batched_tokens = 60 # With two slots, each slot will get 30 tokens - scheduler_config = SchedulerConfig("generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - max_num_partial_prefills=2) - cache_config = CacheConfig(block_size, - 1.0, - 1, - "auto", - enable_prefix_caching=True) - cache_config.num_cpu_blocks = 0 - cache_config.num_gpu_blocks = 32 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - block_size=block_size, - prompt_length=50) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - # To partially prefill both sequences, both can chunk up to 30 tokens - # But the next lowest multiple of the block size (4) is 28 - assert seq_group_meta[0].token_chunk_size == 28 - assert seq_group_meta[1].token_chunk_size == 28 - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 56 - - # On the next iteration, both sequences should finish prefill - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - # Both sequences have 50 - 28 = 22 tokens left to prefill. 
- # This is not a multiple of the block size, but we don't care since we don't - # cache the final partial block of prefix sequences - assert seq_group_meta[0].token_chunk_size == 22 - assert seq_group_meta[1].token_chunk_size == 22 - assert out.num_prefill_groups == 2 - assert out.num_batched_tokens == 44 - - -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) -@pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8]) -def test_chunked_prefill_with_actual_engine(model: str, - max_num_partial_prefills: int): - """Make sure the model can actually sample with concurrent - partial prefills - """ - - prompt = "hello" * 40 - - engine_args = EngineArgs( - model=model, - max_num_partial_prefills=max_num_partial_prefills, - max_num_batched_tokens=40, - max_num_seqs=8, - enable_chunked_prefill=True, - gpu_memory_utilization=0.8, - ) - - engine = LLMEngine.from_engine_args(engine_args) - sampling_params = SamplingParams(temperature=0) - - for req_num in range(max_num_partial_prefills): - engine.add_request(f"{req_num}", prompt, sampling_params) - # first step - request_outputs = engine.step() - # means all are prefilling - assert len(request_outputs) == 0 - assert len(engine.scheduler[0].running) == max_num_partial_prefills diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py deleted file mode 100644 index 9e1b7913dfb9..000000000000 --- a/tests/core/test_num_computed_tokens_update.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from tests.conftest import VllmRunner -from tests.core.utils import create_dummy_prompt -from vllm.engine.llm_engine import LLMEngine -from vllm.platforms import current_platform -from vllm.sequence import SequenceGroup - -MODEL = "JackFram/llama-160m" - - -def add_seq_group_to_engine(engine: LLMEngine, seq_group: SequenceGroup): - scheduler = engine.scheduler[0] - scheduler.add_seq_group(seq_group) - - -@pytest.mark.parametrize("num_scheduler_steps", [1, 8]) -@pytest.mark.parametrize("enable_chunked_prefill", [False, True]) -@pytest.mark.parametrize("enforce_eager", [False, True]) -def test_num_computed_tokens_update(num_scheduler_steps: int, - enable_chunked_prefill: bool, - enforce_eager: bool): - - is_multi_step = num_scheduler_steps > 1 - is_multi_step_chunked_prefill = is_multi_step and enable_chunked_prefill - - if is_multi_step_chunked_prefill and current_platform.is_rocm(): - pytest.skip("Multi-step with Chunked-Prefill does not support " - "rocm_flash_attn backend") - - # Make a vllm engine - runner = VllmRunner(model_name=MODEL, - gpu_memory_utilization=0.7, - num_scheduler_steps=num_scheduler_steps, - enable_chunked_prefill=enable_chunked_prefill, - enforce_eager=enforce_eager) - engine: LLMEngine = runner.llm.llm_engine - - # In multi-step + chunked-prefill there is no separate single prompt step. - # What is scheduled will run for num_scheduler_steps always. 
- num_prompt_steps = num_scheduler_steps \ - if is_multi_step_chunked_prefill else 1 - - num_output_tokens_list = [4, 8, 12, 15, 16, 17] - - # Create sequence and add to engine - prompt_len = 10 - - for req_idx, num_output_tokens in enumerate(num_output_tokens_list): - seq, seq_group = create_dummy_prompt(request_id=str(req_idx), - prompt_length=prompt_len, - min_tokens=num_output_tokens, - max_tokens=num_output_tokens) - add_seq_group_to_engine(engine, seq_group) - - assert seq.data.get_num_computed_tokens() == 0 - - for _ in range(num_prompt_steps): - # prompt steps - engine.step() - - if not seq.is_finished(): - prompt_num_computed_tokens = seq.data.get_num_computed_tokens() - # Test correctness of num_computed_tokens after the prompt steps - assert prompt_num_computed_tokens == \ - prompt_len + num_prompt_steps - 1 - - decode_step_counter = 0 - while not seq.is_finished(): - # Test correctness of num_computed_tokens after the decode steps - assert seq.data.get_num_computed_tokens( - ) == prompt_num_computed_tokens + decode_step_counter - for _ in range(num_scheduler_steps): - # decode step - engine.step() - decode_step_counter += 1 - - # Test correctness of num_computed_tokens after the sequence finish. - assert seq.data.get_num_computed_tokens( - ) == prompt_len + num_output_tokens - 1 diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py deleted file mode 100644 index 591e1780c11c..000000000000 --- a/tests/core/test_scheduler.py +++ /dev/null @@ -1,1337 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import time -from collections import deque -from typing import Optional -from unittest.mock import MagicMock - -import pytest # noqa -import torch -from torch import Use # noqa - -from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig -from vllm.core.interfaces import AllocStatus -from vllm.core.scheduler import Scheduler, SchedulingBudget -from vllm.lora.request import LoRARequest -from vllm.sequence import SequenceGroup, SequenceStatus - -from .utils import (append_new_token, append_new_token_seq, - append_new_token_seq_group, create_dummy_prompt, - get_sequence_groups, schedule_and_update_computed_tokens) - - -def test_scheduler_add_seq_group(): - block_size = 4 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens=100, - max_num_seqs=64, - max_model_len=1, - ) - cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto") - cache_config.num_cpu_blocks = 4 - cache_config.num_gpu_blocks = 4 - scheduler = Scheduler(scheduler_config, cache_config, None) - - # Add seq group to scheduler. - num_seq_group = 4 - for i in range(num_seq_group): - _, seq_group = create_dummy_prompt(str(i), - block_size, - block_size=block_size) - scheduler.add_seq_group(seq_group) - assert scheduler.get_num_unfinished_seq_groups() == i + 1 - - -def test_scheduler_abort_seq_group(): - block_size = 4 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens=100, - max_num_seqs=64, - max_model_len=1, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 4 - cache_config.num_gpu_blocks = 4 - scheduler = Scheduler(scheduler_config, cache_config, None) - - # Add multiple seq groups to scheduler. - num_seq_group = 4 - request_ids: set[str] = set() - for i in range(num_seq_group): - _, seq_group = create_dummy_prompt(str(i), block_size) - scheduler.add_seq_group(seq_group) - request_ids.add(str(i)) - - # Abort all added seq groups. 
- assert scheduler.get_num_unfinished_seq_groups() == num_seq_group - scheduler.abort_seq_group(request_ids) - assert scheduler.get_num_unfinished_seq_groups() == 0 - - -def test_scheduler_schedule_simple(): - block_size = 4 - num_seq_group = 4 - max_model_len = 16 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens=64, - max_num_seqs=num_seq_group, - max_model_len=max_model_len, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 8 - cache_config.num_gpu_blocks = 8 - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. - for i in range(num_seq_group): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=block_size, - block_size=block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - # Schedule seq groups prompts. - num_tokens = block_size * num_seq_group - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - assert out.num_batched_tokens == num_tokens - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - assert len(seq_group_meta) == num_seq_group - append_new_token(out, 1) - - # Schedule seq groups generation. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set(running) - assert out.num_batched_tokens == num_seq_group - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - assert len(seq_group_meta) == num_seq_group - append_new_token(out, 1) - - -def test_scheduler_prefill_prioritized(): - """Verify running batched tokens are not applied to prefill requests.""" - block_size = 4 - max_model_len = 30 - max_batched_num_tokens = 30 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens=max_batched_num_tokens, - max_num_seqs=2, - max_model_len=max_model_len, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 16 - cache_config.num_gpu_blocks = 16 - scheduler = Scheduler(scheduler_config, cache_config, None) - - # Add seq groups to scheduler. - _, seq_group_a = create_dummy_prompt("1", 1, block_size=block_size) - scheduler.add_seq_group(seq_group_a) - - # Schedule seq groups prompts. - _, out = schedule_and_update_computed_tokens(scheduler) - assert get_sequence_groups(out) == [seq_group_a] - - # Add a new prefill request B. - _, seq_group_b = create_dummy_prompt("2", 30, block_size=block_size) - scheduler.add_seq_group(seq_group_b) - - # Verify prefill requests are prioritized. Since max_batched_num_tokens - # is 1, new prefill request has to be scheduled first. - _, out = schedule_and_update_computed_tokens(scheduler) - assert get_sequence_groups(out) == [seq_group_b] - - -def test_scheduler_schedule_preempt_abort(): - block_size = 4 - max_model_len = 16 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens=64, - max_num_seqs=2, - max_model_len=max_model_len, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 2 - cache_config.num_gpu_blocks = 2 - scheduler = Scheduler(scheduler_config, cache_config, None) - - # Add seq groups to scheduler. 
- seq_a, seq_group_a = create_dummy_prompt("1", - block_size, - block_size=block_size) - seq_b, seq_group_b = create_dummy_prompt("2", - block_size, - block_size=block_size) - scheduler.add_seq_group(seq_group_a) - scheduler.add_seq_group(seq_group_b) - - # Schedule seq groups prompts. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert get_sequence_groups(out) == [seq_group_a, seq_group_b] - assert out.num_batched_tokens == block_size * 2 # seq_a and seq_b - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - assert len(seq_group_meta) == 2 - assert scheduler.get_num_unfinished_seq_groups() == 2 - - # Append "generated" tokens, allowing the sequence to mark prompt tokens as - # processed. - append_new_token(out, 1) - - # Schedule seq groups generation and preempt seq group b. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert get_sequence_groups(out) == [seq_group_a] - assert out.num_batched_tokens == 1 - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - assert len(seq_group_meta) == 1 - assert scheduler.get_num_unfinished_seq_groups() == 2 - assert out.preempted == 1 - - # Abort seq group a. Re-schedule seq group b prompt with recomputation. - scheduler.abort_seq_group("1") - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert get_sequence_groups(out) == [seq_group_b] - assert out.num_batched_tokens == 5 # 4 prompt + 1 generation. - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - assert len(seq_group_meta) == 1 - assert scheduler.get_num_unfinished_seq_groups() == 1 - - -def test_scheduler_max_seqs(): - block_size = 4 - num_seq_group = 4 - max_seq_group = 2 - max_model_len = 16 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens=64, - max_num_seqs=max_seq_group, - max_model_len=max_model_len, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 8 - cache_config.num_gpu_blocks = 8 - scheduler = Scheduler(scheduler_config, cache_config, None) - - all_seq_groups: list[SequenceGroup] = [] - # Add seq groups to scheduler. - for i in range(num_seq_group): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=block_size, - block_size=block_size) - all_seq_groups.append(seq_group) - - # Append 1 seq group - scheduler.add_seq_group(all_seq_groups[0]) - - # Schedule seq groups prompts. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set([all_seq_groups[0]]) - append_new_token(out, 1) - - # Schedule seq groups generation. - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set([all_seq_groups[0]]) - append_new_token(out, 1) - - # Append 2 more seq group - scheduler.add_seq_group(all_seq_groups[1]) - scheduler.add_seq_group(all_seq_groups[2]) - - # Schedule seq groups prompts. - # Only 1 seq group should be scheduled since max_seq_group is 2 - # and one is prompting. 
- _, out = schedule_and_update_computed_tokens(scheduler) - assert set(get_sequence_groups(out)) == set([all_seq_groups[1]]) - - -def test_scheduler_delay_factor(): - block_size = 4 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens=100, - max_num_seqs=64, - max_model_len=16, - delay_factor=0.5, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 8 - cache_config.num_gpu_blocks = 8 - scheduler = Scheduler(scheduler_config, cache_config, None) - - # schedule first prompt - seq_group_meta, seq_group = create_dummy_prompt("0", - prompt_length=block_size, - block_size=block_size) - scheduler.add_seq_group(seq_group) - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert out.num_prefill_groups > 0 - assert seq_group_meta[0].request_id == '0' - append_new_token(out, 1) - - # wait for a second before scheduling next prompt - time.sleep(1) - seq_group_meta, seq_group = create_dummy_prompt("1", - prompt_length=block_size, - block_size=block_size) - scheduler.add_seq_group(seq_group) - - # second prompt should *not* be scheduled - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert out.num_prefill_groups == 0 - assert seq_group_meta[0].request_id == '0' - append_new_token(out, 1) - - # wait for more than 0.5 second and try again - time.sleep(0.6) - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert out.num_prefill_groups > 0 - assert seq_group_meta[0].request_id == '1' - append_new_token(out, 1) - - -def initialize_scheduler( - *, - max_num_seqs=1000, - max_token_budget=1000, - max_model_len=1000, - lora_config=None, - block_size=4, - num_cpu_blocks=8, - num_gpu_blocks=8, - enable_prefix_caching=False, - enable_chunked_prefill=False, -): - block_size = block_size - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens=max_token_budget, - max_num_seqs=max_num_seqs, - max_model_len=max_model_len, - enable_chunked_prefill=enable_chunked_prefill, - ) - cache_config = CacheConfig( - block_size, - 1.0, - 1, - "auto", - enable_prefix_caching=enable_prefix_caching, - ) - cache_config.num_cpu_blocks = num_cpu_blocks - cache_config.num_gpu_blocks = num_gpu_blocks - scheduler = Scheduler(scheduler_config, cache_config, lora_config) - return scheduler - - -def create_token_budget(token_budget: int = 10000, - max_num_seqs: int = 10000) -> SchedulingBudget: - return SchedulingBudget( - token_budget=token_budget, - max_num_seqs=max_num_seqs, - ) - - -def add_token_budget(budget: SchedulingBudget, - num_batched_tokens: int = 0, - num_curr_seqs: int = 0): - mock_seq_group = create_dummy_prompt('10', prompt_length=60)[1] - budget.add_num_batched_tokens(mock_seq_group.request_id, - num_batched_tokens) - budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs) - - -def test_prefill_schedule_max_prompt_len(): - """ - Test prompt longer than max_prompt_len is aborted. 
- """ - block_size = 4 - scheduler = initialize_scheduler(max_model_len=30, block_size=block_size) - _, seq_group = create_dummy_prompt("0", - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - budget = create_token_budget() - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 1 - assert len(output.seq_groups) == 0 - assert budget.num_batched_tokens == 0 - assert budget.num_curr_seqs == 0 - assert len(remaining_waiting) == 0 - - -def test_prefill_schedule_token_budget(): - """ - Test token budget respected. - """ - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) - budget = create_token_budget(token_budget=0) - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - - # 0 token budget == nothing is scheduled. - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 0 - assert len(output.seq_groups) == 0 - assert budget.num_batched_tokens == 0 - assert budget.num_curr_seqs == 0 - assert len(remaining_waiting) == 2 - - # 60 token budget == 1 request scheduled. - budget = create_token_budget(token_budget=60) - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 0 - assert len(output.seq_groups) == 1 - assert budget.num_batched_tokens == 60 - assert budget.num_curr_seqs == 1 - assert len(remaining_waiting) == 1 - - # Test when current_batched_tokens respected. - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=16, - num_gpu_blocks=16) - budget = create_token_budget(token_budget=60) - add_token_budget(budget, 30, 0) - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - # Cannot schedule a prompt that doesn't fit the budget. - scheduler.add_seq_group(seq_group) - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 0 - assert len(output.seq_groups) == 0 - assert budget.num_batched_tokens == 30 - assert budget.num_curr_seqs == 0 - assert len(remaining_waiting) == 1 - budget = create_token_budget(token_budget=90) - add_token_budget(budget, 30, 0) - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.seq_groups) == 1 - assert budget.num_batched_tokens == 90 - assert budget.num_curr_seqs == 1 - assert len(remaining_waiting) == 0 - - -def test_prefill_schedule_max_seqs(): - """ - Test max seq respected. - """ - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) - budget = create_token_budget(max_num_seqs=2) - for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 0 - assert len(output.seq_groups) == 2 - assert budget.num_batched_tokens == 120 - assert budget.num_curr_seqs == 2 - assert len(remaining_waiting) == 1 - - # Verify curr_num_seqs respected. 
- scheduler.waiting = deque() - budget = create_token_budget(max_num_seqs=2) - add_token_budget(budget, 0, 2) - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 0 - assert len(output.seq_groups) == 0 - assert budget.num_batched_tokens == 0 - assert budget.num_curr_seqs == 2 - assert len(remaining_waiting) == 1 - - -def test_prefill_schedule_max_lora(): - """ - Test max lora is respected and prioritized. - """ - block_size = 4 - lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) - scheduler = initialize_scheduler(lora_config=lora_config, - block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) - budget = create_token_budget(token_budget=120) - curr_loras: set[int] = set() - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size, - lora_request=LoRARequest( - lora_name=str(i), - lora_int_id=i + 1, - lora_path="abc")) - scheduler.add_seq_group(seq_group) - # Add two more requests to verify lora is prioritized. - # 0: LoRA, 1: LoRA, 2: regular, 3: regular - # In the first iteration, index 0, 2 is scheduled. - # If a request is not scheduled because it hits max lora, it is - # prioritized. Verify that. - for i in range(2, 4): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - # Schedule 2 requests (0 and 2) - output = scheduler._schedule_prefills(budget, curr_loras) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 0 - assert len(output.seq_groups) == 2 - assert budget.num_batched_tokens == 120 - assert budget.num_curr_seqs == 2 - assert len(remaining_waiting) == 2 - assert len(curr_loras) == 1 - # The second lora request is scheduled next as FCFS policy. - # Reset curr_loras so that it can be scheduled. - curr_loras = set() - budget = create_token_budget(token_budget=60) - output = scheduler._schedule_prefills(budget, curr_loras) - remaining_waiting = scheduler.waiting - assert len(output.seq_groups) == 1 - assert output.seq_groups[0].seq_group.request_id == "1" - assert len(remaining_waiting) == 1 - assert len(curr_loras) == 1 - assert budget.num_batched_tokens == 60 - - -def test_prefill_schedule_no_block_manager_capacity(): - """ - Test sequence cannot be scheduled due to block manager has no capacity. 
- """ - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_gpu_blocks=128, - num_cpu_blocks=128) - budget = create_token_budget() - for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - scheduler.block_manager.can_allocate = MagicMock() - scheduler.block_manager.can_allocate.return_value = AllocStatus.LATER - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 0 - assert len(output.seq_groups) == 0 - assert budget.num_batched_tokens == 0 - assert budget.num_curr_seqs == 0 - assert len(remaining_waiting) == 3 - - scheduler = initialize_scheduler() - budget = create_token_budget() - for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group) - scheduler.block_manager.can_allocate = MagicMock() - scheduler.block_manager.can_allocate.return_value = AllocStatus.NEVER - output = scheduler._schedule_prefills(budget, None) - remaining_waiting = scheduler.waiting - assert len(output.ignored_seq_groups) == 3 - assert len(output.seq_groups) == 0 - assert budget.num_batched_tokens == 0 - assert budget.num_curr_seqs == 0 - assert len(remaining_waiting) == 0 - - -def test_decode_schedule_preempted(): - """ - Test decodes cannot be scheduled and preempted. - """ - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) - curr_loras = None - for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - scheduler._add_seq_group_to_running(seq_group) - scheduler.block_manager.can_append_slots = MagicMock() - - def cannot_append_second_group(seq_group, num_lookahead_slots): - return seq_group.request_id != "1" - - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) - - # 1 cannot be scheduled, and the lowest priority (request 2) - # should be preempted. 1 will also be preempted. - budget = create_token_budget() - output = scheduler._schedule_running(budget, curr_loras) - remaining_running = scheduler.running - assert len(remaining_running) == 0 - assert len(output.decode_seq_groups) == 1 - assert len(output.prefill_seq_groups) == 0 - assert output.decode_seq_groups[0].seq_group.request_id == "0" - assert len(output.preempted) == 2 - # Verify budgets are updated. - assert budget.num_batched_tokens == 1 - # NOTE: When enable_chunk is False, num_seqs budget is not updated. - # assert budget.num_curr_seqs == 1 - # Both should be preempted, not swapped. - assert output.blocks_to_swap_out == [] - # Nothing is copied. - assert output.blocks_to_copy == [] - - -def test_schedule_decode_blocks_to_copy_update(): - """ - Verify blocks_to_copy is updated. - """ - block_size = 4 - scheduler = initialize_scheduler(block_size=4, - num_cpu_blocks=16, - num_gpu_blocks=16) - _, seq_group = create_dummy_prompt("1", - prompt_length=60, - block_size=block_size) - curr_loras = None - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - scheduler._add_seq_group_to_running(seq_group) - - # The last request should be swapped out. 
- scheduler.block_manager.append_slots = MagicMock() - scheduler.block_manager.append_slots.return_value = [(2, 3)] - - budget = create_token_budget() - output = scheduler._schedule_running(budget, curr_loras) - remaining_running = scheduler.running - assert len(remaining_running) == 0 - assert len(output.decode_seq_groups) == 1 - assert len(output.prefill_seq_groups) == 0 - assert len(output.preempted) == 0 - assert len(output.swapped_out) == 0 - # Nothing is preempted. - assert output.blocks_to_swap_out == [] - # Since append_slot returns the source -> dist mapping, it should - # applied. - assert output.blocks_to_copy == [(2, 3)] - - -def test_schedule_swapped_max_loras(): - block_size = 4 - lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) - scheduler = initialize_scheduler(lora_config=lora_config, - block_size=block_size, - num_cpu_blocks=32, - num_gpu_blocks=32) - curr_loras: set[int] = set() - blocks_to_swap_out: list[tuple[int, int]] = [] - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size, - lora_request=LoRARequest( - lora_name=str(i), - lora_int_id=i + 1, - lora_path="abc")) - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - scheduler._swap_out(seq_group, blocks_to_swap_out) - scheduler._add_seq_group_to_swapped(seq_group) - - budget = create_token_budget() - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 1 - assert budget.num_batched_tokens == 1 - assert budget.num_curr_seqs == 1 - assert len(output.decode_seq_groups) == 1 - assert len(output.prefill_seq_groups) == 0 - assert len(curr_loras) == 1 - - -def test_schedule_swapped_cannot_swap_in(): - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=32, - num_gpu_blocks=32) - curr_loras = None - blocks_to_swap_out: list[tuple[int, int]] = [] - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - scheduler._swap_out(seq_group, blocks_to_swap_out) - scheduler._add_seq_group_to_swapped(seq_group) - - # The last request should be swapped out. - scheduler.block_manager.can_swap_in = MagicMock() - scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER - # Since we cannot swap in, none of the requests are swapped in. - budget = create_token_budget() - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 2 - assert budget.num_batched_tokens == 0 - assert budget.num_curr_seqs == 0 - assert len(output.decode_seq_groups) == 0 - assert len(output.prefill_seq_groups) == 0 - - -def test_infeasible_swap(): - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=32, - num_gpu_blocks=32) - curr_loras = None - blocks_to_swap_out: list[tuple[int, int]] = [] - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=block_size) - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - scheduler._swap_out(seq_group, blocks_to_swap_out) - scheduler._add_seq_group_to_swapped(seq_group) - - # The last request should be swapped out. 
- scheduler.block_manager.can_swap_in = MagicMock() - scheduler.block_manager.can_swap_in.return_value = AllocStatus.NEVER - # Since we cannot swap in, none of the requests are swapped in. - budget = create_token_budget() - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 0 - assert len(output.infeasible_seq_groups) == 2 - assert budget.num_batched_tokens == 0 - assert budget.num_curr_seqs == 0 - assert len(output.decode_seq_groups) == 0 - assert len(output.prefill_seq_groups) == 0 - - -def test_schedule_swapped_blocks_to_copy(): - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=32, - num_gpu_blocks=32) - curr_loras = None - _, seq_group = create_dummy_prompt("1", - prompt_length=60, - block_size=block_size) - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - blocks_to_swap_out: list[tuple[int, int]] = [] - scheduler._swap_out(seq_group, blocks_to_swap_out) - scheduler._add_seq_group_to_swapped(seq_group) - - # The last request should be swapped out. - scheduler.block_manager.append_slots = MagicMock() - scheduler.block_manager.append_slots.return_value = [(2, 3)] - - budget = create_token_budget() - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 0 - assert len(output.decode_seq_groups) == 1 - assert len(output.prefill_seq_groups) == 0 - assert output.blocks_to_copy == [(2, 3)] - - -def test_scheduling_budget(): - TOKEN_BUDGET = 4 - MAX_SEQS = 4 - budget = SchedulingBudget(token_budget=TOKEN_BUDGET, max_num_seqs=MAX_SEQS) - assert budget.can_schedule(num_new_tokens=1, num_new_seqs=1) - assert budget.can_schedule(num_new_tokens=4, num_new_seqs=4) - assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=5) - assert not budget.can_schedule(num_new_tokens=5, num_new_seqs=1) - assert not budget.can_schedule(num_new_tokens=5, num_new_seqs=5) - assert budget.remaining_token_budget() == TOKEN_BUDGET - - # Verify add/subtract num batched tokens. - _, seq_group = create_dummy_prompt("1", 3) - budget.add_num_batched_tokens(seq_group.request_id, 2) - assert budget.remaining_token_budget() == 2 - assert budget.num_batched_tokens == 2 - assert budget.can_schedule(num_new_tokens=2, num_new_seqs=1) - assert not budget.can_schedule(num_new_tokens=3, num_new_seqs=1) - # Verify adding another seq group is no-op. - budget.add_num_batched_tokens(seq_group.request_id, 2) - assert budget.remaining_token_budget() == 2 - assert budget.num_batched_tokens == 2 - budget.subtract_num_batched_tokens(seq_group.request_id, 2) - assert budget.remaining_token_budget() == 4 - assert budget.num_batched_tokens == 0 - budget.subtract_num_batched_tokens(seq_group.request_id, 2) - assert budget.remaining_token_budget() == 4 - assert budget.num_batched_tokens == 0 - - # Verify add/subtract max seqs. - _, seq_group = create_dummy_prompt("1", 3) - budget.add_num_seqs(seq_group.request_id, 2) - assert budget.can_schedule(num_new_tokens=1, num_new_seqs=2) - assert not budget.can_schedule(num_new_tokens=1, num_new_seqs=3) - assert budget.num_curr_seqs == 2 - # Verify adding another seq group is no-op. 
- budget.add_num_seqs(seq_group.request_id, 2) - assert budget.num_curr_seqs == 2 - budget.subtract_num_seqs(seq_group.request_id, 2) - assert budget.num_curr_seqs == 0 - budget.subtract_num_seqs(seq_group.request_id, 2) - assert budget.num_curr_seqs == 0 - - -@pytest.mark.parametrize("enable_prefix_caching", [True, False]) -def test_prefix_caching_aware_prefills(enable_prefix_caching): - """ - Test the below scenario: - - For 3 sequences, seqA, seqB, seqC, share the first block as prefix. - - The test verifies the below scenarios: - 1. SeqA is first scheduled. - 2. SeqB and SeqC can be prefilled together in a single schedule round - even though there are not enough token budgets to prefill both without - considering prefix caching. - """ - - block_size = 4 - max_num_batched_tokens = 12 - max_seq_group = 3 - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=16, - num_gpu_blocks=16, - max_token_budget=max_num_batched_tokens, - max_num_seqs=max_seq_group, - max_model_len=max_num_batched_tokens, - enable_prefix_caching=enable_prefix_caching, - ) - - seqA_tokens = list(range(8)) - num_shared_tokens = 4 - seqB_tokens = seqA_tokens[:num_shared_tokens] + list(range( - 12, 16)) # Shared prefix first 4. - seqC_tokens = seqA_tokens[:num_shared_tokens] + list(range( - 16, 20)) # Shared prefix first 4. - - seqA, seqA_group = create_dummy_prompt("0", - prompt_tokens=seqA_tokens, - block_size=block_size) - seqB, seqB_group = create_dummy_prompt("1", - prompt_tokens=seqB_tokens, - block_size=block_size) - seqC, seqC_group = create_dummy_prompt("2", - prompt_tokens=seqC_tokens, - block_size=block_size) - - # Schedule seqA prefill. - scheduler.add_seq_group(seqA_group) - metas, out, _ = scheduler.schedule() - assert (len(out.scheduled_seq_groups) == 1 - and out.scheduled_seq_groups[0].seq_group == seqA_group) - assert out.scheduled_seq_groups[0].token_chunk_size == len(seqA_tokens) - - # Schedule seqA decode. - append_new_token_seq_group(len(seqA_tokens), seqA_group, 999) - metas, out, _ = scheduler.schedule() - - assert len(out.scheduled_seq_groups) == 1 - assert out.scheduled_seq_groups[0].seq_group == seqA_group - assert out.scheduled_seq_groups[0].token_chunk_size == 1 - - # Schedule seqB and seqC prefills should work with prefix caching. - scheduler.add_seq_group(seqB_group) - scheduler.add_seq_group(seqC_group) - metas, out, _ = scheduler.schedule() - - if enable_prefix_caching: - assert len(out.scheduled_seq_groups) == 2 - assert set([ - out.scheduled_seq_groups[0].seq_group, - out.scheduled_seq_groups[1].seq_group, - ]) == set([seqB_group, seqC_group]) - assert len(metas) == 2 - for meta in metas: - assert meta.token_chunk_size == 8 - assert (len(meta.computed_block_nums) == num_shared_tokens // - block_size) # 1 Block for the 8 tokens. - else: - assert len(out.scheduled_seq_groups) == 1 - assert len(metas) == 1 - assert metas[0].token_chunk_size == 8 - assert len(metas[0].computed_block_nums) == 0 # No blocks computed. - - -def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching( -): - """ - This test verifies that we don't schedule new prefills if there's already - a continuous prefill in progress even though the new prefills with shared - prefix can fit in the token budget: - - - SeqA is being chunked prefill. - - SeqB with the same prompt shouldn't be scheduled for prefill even though - there's enough token budget to prefill the cached tokens. - - Neither should seqC be scheduled. 
- - - When seqA is in decoding phase, seqB and seqC can be scheduled. - - Entire seqB should be prefilled since it's a full prefix cache hit. - - SeqC would be partially prefilled with the prefix shared, and the - remaining unique tokens would be prefilled (rounded down to be - block-size aligned). - """ - - block_size = 2 - max_num_batched_tokens = 4 - max_seq_group = 3 - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=16, - num_gpu_blocks=16, - max_token_budget=max_num_batched_tokens, - max_num_seqs=max_seq_group, - max_model_len=100, - enable_prefix_caching=True, - enable_chunked_prefill=True, - ) - - seqA_tokens = list(range(8)) - seqB_tokens = seqA_tokens - seqC_shared_prefix_len = 4 - seqC_tokens = seqA_tokens[:seqC_shared_prefix_len] + list(range(12, 20)) - - seqA, seqA_group = create_dummy_prompt("0", - prompt_tokens=seqA_tokens, - block_size=block_size) - seqB, seqB_group = create_dummy_prompt("1", - prompt_tokens=seqB_tokens, - block_size=block_size) - - # Chunked prefill seqA. - scheduler.add_seq_group(seqA_group) - metas, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - assert out.scheduled_seq_groups[0].seq_group == seqA_group - assert out.scheduled_seq_groups[0].token_chunk_size == 4 - - # seqB should not be scheduled with ongoing prefills. - scheduler.add_seq_group(seqB_group) - metas, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - assert out.scheduled_seq_groups[0].seq_group == seqA_group - assert out.scheduled_seq_groups[0].token_chunk_size == 4 - - # both seqB and seqC can now be scheduled with seqA is over. - # seqA is in decoding phase. - append_new_token_seq(seqA, 999) - seqC, seqC_group = create_dummy_prompt("2", - prompt_tokens=seqC_tokens, - block_size=block_size) - scheduler.add_seq_group(seqC_group) - metas, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 3 - - metas = {meta.request_id: meta for meta in metas} - assert metas[seqA_group.request_id].token_chunk_size == 1 # Decode - assert (metas[seqB_group.request_id].token_chunk_size == 8 - ) # Fully cached prefill - assert ( - metas[seqC_group.request_id].token_chunk_size == 6 - ), "A partial prefix of C (4 tokens) should be prefilled, with the " - "remaining tokens fit into 3 token budget (4-1 from the seqA). It will " - "then be rounded down to 2 tokens on block size, thus 6 tokens in total." - - -def test_no_batches_mixed_with_prompt_tokens_and_prompt_embeds(): - """ - Test that the scheduler does not schedule batches with prompt tokens and - prompt embeddings co-mingled. 
- """ - block_size = 2 - max_seq_group = 3 - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=16, - num_gpu_blocks=16, - max_num_seqs=max_seq_group, - max_model_len=100, - enable_prefix_caching=True, - ) - - # the odd indexed inputs should be passed in via embeddings, - # evens via token_ids - seq_length = 7 - embedding_size = 5 - num_seqs = 11 - seq_tokens: list[list[int]] = [] - seq_embeds: list[Optional[torch.Tensor]] = [] - for i in range(num_seqs): - if i % 2: - seq_tokens.append(list(range(seq_length))) - seq_embeds.append(None) - else: - seq_tokens.append([0] * seq_length) - seq_embeds.append(torch.rand(embedding_size)) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens[i], - prompt_embeds=seq_embeds[i], - block_size=block_size) - for i in range(len(seq_tokens)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - - while not all(seq.is_finished() for seq, _ in seq_and_seq_groups): - unfinished_seq_groups = [ - seq_group for _, seq_group in seq_and_seq_groups - if not seq_group.is_finished() - ] - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) > 0 - batch_is_prompt_embeds = out.scheduled_seq_groups[ - 0].seq_group.uses_prompt_embeds() - expected_scheduled_seq_groups = [ - seq_group for seq_group in unfinished_seq_groups - if seq_group.uses_prompt_embeds() == batch_is_prompt_embeds - ] - - # We should have as many scheduled groups as possible, without mixing - assert len(out.scheduled_seq_groups) == min( - max_seq_group, len(expected_scheduled_seq_groups)) - assert all(scheduled_seq_group.seq_group.uses_prompt_embeds() == - batch_is_prompt_embeds - for scheduled_seq_group in out.scheduled_seq_groups) - - # Finish the scheduled groups - for scheduled_seq_group in out.scheduled_seq_groups: - for seq in scheduled_seq_group.seq_group.seqs: - seq.status = SequenceStatus.FINISHED_STOPPED - scheduler.free_finished_seq_groups() - - -def test_remove_seq_from_computed_blocks_tracker(): - """ - Test that computed_blocks_tracker correctly removes stale sequences - during scheduling. - - The test covers 9 scheduling branches where stale seqs are removed: - - 1 in _schedule_swapped - - 1 in _schedule_priority_preemption - - 7 in _schedule_prefill - - Each branch is tested to ensure proper cleanup of - _seq_id_to_num_tokens_computed. - """ - # Budget can not schedule in swapped - block_size = 2 - max_seq_group = 3 - seq_tokens_with_swapped: list[list[int]] = [] - blocks_to_swap_out: list[tuple[int, int]] = [] - curr_loras: set[int] = set() - - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=16, - max_num_seqs=max_seq_group, - enable_prefix_caching=True, - ) - budget = create_token_budget(token_budget=15) - - seq_length = 16 - num_seqs = 3 - for i in range(num_seqs): - seq_tokens_with_swapped.append([i] * seq_length) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_with_swapped[i], - block_size=block_size) - for i in range(len(seq_tokens_with_swapped)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler._allocate_and_set_running(seq_group) - scheduler._swap_out(seq_group, blocks_to_swap_out) - scheduler._add_seq_group_to_swapped(seq_group) - - scheduler._schedule_swapped(budget, curr_loras) - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. 
- _seq_id_to_num_tokens_computed.get(1)) - assert seq_id_to_num_tokens_computed is None - - # Prefill schedule don't have a space for another LoRA, so - # we ignore this request for now. - block_size = 4 - lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) - scheduler = initialize_scheduler(lora_config=lora_config, - block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64, - enable_prefix_caching=True) - budget = create_token_budget(token_budget=120) - num_seqs = 2 - for i in range(num_seqs): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=seq_length, - block_size=block_size, - lora_request=LoRARequest( - lora_name=str(i), - lora_int_id=i + 1, - lora_path="abc")) - scheduler.add_seq_group(seq_group) - - scheduler._schedule_prefills(budget, curr_loras) - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) - assert seq_id_to_num_tokens_computed is None - - # Priority preemption schedule - scheduler._schedule_priority_preemption(budget) - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) - assert seq_id_to_num_tokens_computed is None - - # Prefill scheduler does not schedule batches with prompt tokens and - # prompt embeddings co-mingled. - block_size = 2 - max_seq_group = 3 - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=16, - num_gpu_blocks=16, - max_num_seqs=max_seq_group, - max_model_len=100, - enable_prefix_caching=True, - ) - seq_length = 7 - embedding_size = 5 - seq_tokens_with_embedding: list[list[int]] = [] - seq_embeds: list[Optional[torch.Tensor]] = [] - - seq_tokens_with_embedding.append(list(range(seq_length))) - seq_embeds.append(None) - seq_tokens_with_embedding.append([0] * seq_length) - seq_embeds.append(torch.rand(embedding_size)) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_with_embedding[i], - prompt_embeds=seq_embeds[i], - block_size=block_size) - for i in range(len(seq_tokens_with_embedding)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - - scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) - assert seq_id_to_num_tokens_computed is None - - # Prefill scheduler budget num_batched_tokens - # >= scheduler_config max_num_batched_tokens - block_size = 2 - max_seq_group = 3 - seq_tokens_prefill_budget: list[list[int]] = [] - - scheduler = initialize_scheduler( - block_size=block_size, - max_token_budget=8, - num_cpu_blocks=16, - num_gpu_blocks=16, - max_num_seqs=max_seq_group, - max_model_len=5, - enable_prefix_caching=True, - ) - seq_length = 4 - num_seqs = 3 - for i in range(num_seqs): - seq_tokens_prefill_budget.append([i] * seq_length) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_prefill_budget[i], - block_size=block_size) - for i in range(len(seq_tokens_prefill_budget)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - - scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. 
- _seq_id_to_num_tokens_computed.get(2)) - assert seq_id_to_num_tokens_computed is None - - # Budget can not schedule in waiting - block_size = 2 - max_seq_group = 3 - - scheduler = initialize_scheduler( - block_size=block_size, - max_token_budget=30, - num_cpu_blocks=16, - num_gpu_blocks=16, - max_num_seqs=max_seq_group, - max_model_len=30, - enable_prefix_caching=True, - ) - seq_length = 16 - num_seqs = 3 - seq_tokens_prefill_budget_waiting: list[list[int]] = [] - - for i in range(num_seqs): - seq_tokens_prefill_budget_waiting.append(list(range(seq_length))) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_prefill_budget_waiting[i], - block_size=block_size) - for i in range(len(seq_tokens_prefill_budget_waiting)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - - scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) - assert seq_id_to_num_tokens_computed is None - - # Sequence num_new_tokens > prompt_limit marked FINISHED_IGNORED - block_size = 2 - max_seq_group = 3 - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=16, - num_gpu_blocks=16, - max_num_seqs=max_seq_group, - max_model_len=30, - enable_prefix_caching=True, - ) - - seq_length = 31 - seq_tokens_prompt_limit: list[list[int]] = [] - seq_tokens_prompt_limit.append(list(range(seq_length))) - seq_and_seq_groups = [ - create_dummy_prompt("0", - prompt_tokens=seq_tokens_prompt_limit[0], - block_size=block_size) - ] - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(0)) - assert seq_id_to_num_tokens_computed is None - - # Budget can not allocate, AllocStatus is NEVER marked FINISHED_IGNORED - block_size = 2 - max_seq_group = 3 - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=160, - num_gpu_blocks=160, - max_num_seqs=max_seq_group, - max_model_len=320, - enable_prefix_caching=True, - ) - - seq_length = 320 - num_seqs = 1 - seq_tokens_never: list[list[int]] = [] - for i in range(num_seqs): - seq_tokens_never.append(list(range(seq_length))) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_never[i], - block_size=block_size) - for i in range(len(seq_tokens_never)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - - scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. 
- _seq_id_to_num_tokens_computed.get(0)) - assert seq_id_to_num_tokens_computed is None - - # Budget can not allocate, AllocStatus is LATER - block_size = 2 - max_seq_group = 3 - scheduler = initialize_scheduler( - block_size=block_size, - num_cpu_blocks=160, - num_gpu_blocks=160, - max_num_seqs=max_seq_group, - max_model_len=320, - enable_prefix_caching=True, - ) - - seq_length = 160 - num_seqs = 2 - seq_tokens_later: list[list[int]] = [] - for i in range(num_seqs): - seq_tokens_later.append(list(range(seq_length))) - - seq_and_seq_groups = [ - create_dummy_prompt(f"{i}", - prompt_tokens=seq_tokens_later[i], - block_size=block_size) - for i in range(len(seq_tokens_later)) - ] - - for _, seq_group in seq_and_seq_groups: - scheduler.add_seq_group(seq_group) - - scheduler._schedule_default() - seq_id_to_num_tokens_computed = ( - scheduler.block_manager._computed_blocks_tracker. - _seq_id_to_num_tokens_computed.get(1)) - assert seq_id_to_num_tokens_computed is None diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py deleted file mode 100644 index 20cc083ec8db..000000000000 --- a/tests/core/test_scheduler_encoder_decoder.py +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest # noqa - -from vllm.config import CacheConfig, SchedulerConfig -from vllm.core.scheduler import Scheduler -from vllm.sequence import SequenceGroup - -from .utils import (append_new_token, create_dummy_prompt_encoder_decoder, - get_sequence_groups, schedule_and_update_computed_tokens) - - -def test_scheduler_schedule_simple_encoder_decoder(): - ''' - Test basic scheduler functionality in the context - of an encoder/decoder model. Focus on testing - enc/dec-specific functionality sense tests already - exist for decoder-only functionality - - Test behavior: - * Construct Scheduler - * Construct dummy encoder/decoder sequence groups - * Add dummy seq groups to scheduler backlog - * Schedule the next seq group & validate: - * Cross-attn block tables - * Updated states of seq groups - * Number of batched tokens - * Number of blocks to copy/swap-in/swap-out - * Number of scheduled seq groups - * Repeat for both prefill- and decode-phase - * Abort scheduled seq groups - * Assert that aborted seq groups no longer appear in - cross-attention block table - ''' - - block_size = 4 - num_seq_group = 4 - max_model_len = 16 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens=64, - max_num_seqs=num_seq_group, - max_model_len=max_model_len, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group - cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group - scheduler = Scheduler(scheduler_config, cache_config, None) - running: list[SequenceGroup] = [] - - # Add seq groups to scheduler. - req_id_list = [] - for i in range(num_seq_group): - req_id = str(i) - req_id_list.append(req_id) - _, _, seq_group = create_dummy_prompt_encoder_decoder( - req_id, block_size, block_size, block_size) - scheduler.add_seq_group(seq_group) - running.append(seq_group) - - # Schedule seq groups prefill. 
- num_tokens = block_size * num_seq_group - seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler) - # - Verify that sequence group cross-attention block tables are - # registered with the block manager - assert all([(req_id in scheduler.block_manager.cross_block_tables) - for req_id in req_id_list]) - # - Validate sequence-group status - assert set(get_sequence_groups(out)) == set(running) - # - Validate number of batched tokens - assert out.num_batched_tokens == num_tokens - # - Validate there are no remaining blocks to swap - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - # - Validate all seq groups were scheduled - assert len(seq_group_meta_list) == num_seq_group - append_new_token(out, 1) - - # Schedule seq groups decode. - seq_group_meta_list, out = schedule_and_update_computed_tokens(scheduler) - # - Verify that sequence group metadata includes encoder attention - # and cross-attention metadata - assert all([ - not ((seq_group_meta.encoder_seq_data is None) or - (seq_group_meta.cross_block_table is None)) - for seq_group_meta in seq_group_meta_list - ]) - # - Validate sequence-group status - assert set(get_sequence_groups(out)) == set(running) - # - Validate there is one batched token per seq group - assert out.num_batched_tokens == num_seq_group - # - Validate there are no remaining blocks to swap - assert (not out.blocks_to_copy and not out.blocks_to_swap_in - and not out.blocks_to_swap_out) - # - Validate that all seq groups were scheduled - assert len(seq_group_meta_list) == num_seq_group - append_new_token(out, 1) - - # Abort sequences - for req_id in req_id_list: - scheduler.abort_seq_group(req_id) - # - Verify that sequence group cross-attention block tables are - # NO LONGER registered with the block manager - assert req_id not in scheduler.block_manager.cross_block_tables diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py deleted file mode 100644 index ee9ac2129f2d..000000000000 --- a/tests/core/test_serialization.py +++ /dev/null @@ -1,36 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import msgspec - -from vllm.executor.msgspec_utils import decode_hook, encode_hook -from vllm.sequence import ExecuteModelRequest - -from .utils import create_batch - - -def test_msgspec_serialization(): - num_lookahead_slots = 4 - seq_group_metadata_list, _, _ = create_batch(16, num_lookahead_slots) - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=num_lookahead_slots, - running_queue_size=4) - - encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) - decoder = msgspec.msgpack.Decoder(ExecuteModelRequest, - dec_hook=decode_hook) - req = decoder.decode(encoder.encode(execute_model_req)) - expected = execute_model_req.seq_group_metadata_list - actual = req.seq_group_metadata_list - assert (len(expected) == len(actual)) - expected = expected[0] - actual = actual[0] - - assert expected.block_tables == actual.block_tables - assert expected.is_prompt == actual.is_prompt - assert expected.request_id == actual.request_id - assert (expected.seq_data[0].prompt_token_ids == - actual.seq_data[0].prompt_token_ids) - assert (expected.seq_data[0].output_token_ids == - actual.seq_data[0].output_token_ids) diff --git a/tests/core/utils.py b/tests/core/utils.py deleted file mode 100644 index 033fffd2c4e2..000000000000 --- a/tests/core/utils.py +++ /dev/null @@ -1,392 +0,0 
@@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import time -from collections import defaultdict -from collections.abc import Sequence as GenericSequence -from itertools import count -from typing import Any, Optional, Union - -import torch - -from vllm.core.scheduler import Scheduler, SchedulerOutputs -from vllm.inputs import EncoderDecoderInputs, embeds_inputs, token_inputs -from vllm.lora.request import LoRARequest -from vllm.sampling_params import SamplingParams -from vllm.sequence import (Logprob, Sequence, SequenceData, SequenceGroup, - SequenceGroupMetadata) - - -def create_dummy_prompt( - request_id: str, - prompt_length: int = -1, - block_size: Optional[int] = None, - lora_request: Optional[LoRARequest] = None, - prompt_tokens: Optional[list[int]] = None, - prompt_embeds: Optional[torch.Tensor] = None, - min_tokens: int = 0, - max_tokens: int = 16, -) -> tuple[Sequence, SequenceGroup]: - if not block_size: - block_size = prompt_length - - if prompt_tokens is None: - # Create dummy prompt sequence with tokens 0...block_size-1 - # and prompt "0 ... block_size". - prompt_tokens = list(range(prompt_length)) - - prompt_str = " ".join([str(t) for t in prompt_tokens]) - inputs = token_inputs( - prompt_token_ids=prompt_tokens, - prompt=prompt_str) if prompt_embeds is None else embeds_inputs( - prompt_embeds=prompt_embeds) - prompt = Sequence( - int(request_id), - inputs=inputs, - block_size=block_size, - ) - seq_group = SequenceGroup( - request_id=request_id, - seqs=[prompt], - arrival_time=time.time(), - sampling_params=SamplingParams(max_tokens=max_tokens, - min_tokens=min_tokens), - lora_request=lora_request, - ) - - return prompt, seq_group - - -def create_dummy_lora_sequence(request_id: int, token_ids: list[int], - block_size: int, lora_int_id: int) -> Sequence: - return Sequence(seq_id=request_id, - inputs=token_inputs(token_ids), - block_size=block_size, - lora_request=LoRARequest(lora_name="dummy", - lora_path="/dummy", - lora_int_id=lora_int_id)) - - -def create_dummy_sequence(request_id: int, token_ids: list[int], - block_size: int) -> Sequence: - return Sequence( - seq_id=request_id, - inputs=token_inputs(token_ids), - block_size=block_size, - ) - - -def create_dummy_prompt_encoder_decoder( - request_id: str, - decoder_prompt_length: int, - encoder_prompt_length: int, - block_size: Optional[int] = None, - lora_request: Optional[LoRARequest] = None, -) -> tuple[Sequence, Sequence, SequenceGroup]: - if not block_size: - block_size = decoder_prompt_length - - # Create dummy prompt sequence with tokens 0...block_size-1 - # and prompt "0 ... block_size". 
Note that the prompt string - # doesn't actually match the tokens - decoder_prompt_tokens = list(range(decoder_prompt_length)) - decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens]) - encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) - encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) - - inputs: EncoderDecoderInputs = { - "decoder": token_inputs(decoder_prompt_tokens, - prompt=decoder_prompt_str), - "encoder": token_inputs(encoder_prompt_tokens, - prompt=encoder_prompt_str), - } - - decoder_prompt = Sequence(int(request_id), - inputs=inputs["decoder"], - block_size=block_size) - - encoder_prompt = Sequence(int(request_id), - inputs=inputs["encoder"], - block_size=block_size) - - seq_group = SequenceGroup(request_id=request_id, - seqs=[decoder_prompt], - arrival_time=time.time(), - lora_request=lora_request, - encoder_seq=encoder_prompt) - - return decoder_prompt, encoder_prompt, seq_group - - -def create_seq_group( - seq_prompt_len: int = 1024, - seq_output_lens: GenericSequence[int] = (128, ), - request_id: str = '0', - seq_id_start: int = 0, - sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: - - assert len(seq_output_lens) > 0 - - if sampling_params is None: - sampling_params = SamplingParams() - - prompt_token_ids = [0] * seq_prompt_len - - seqs: list[Sequence] = [] - for seq_id_offset, output_len in enumerate(seq_output_lens): - seq = Sequence( - seq_id=seq_id_start + seq_id_offset, - inputs=token_inputs(prompt_token_ids), - block_size=16, - ) - - for i in range(output_len): - seq.append_token_id( - token_id=i, - logprobs={i: Logprob(0.0)}, - ) - seqs.append(seq) - - seq_group = SequenceGroup( - request_id=request_id, - seqs=seqs, - sampling_params=sampling_params, - arrival_time=time.time(), - ) - - return seq_group - - -def create_seq_group_encoder_decoder( - seq_prompt_len: int = 1024, - seq_output_lens: GenericSequence[int] = (128, ), - request_id: str = '0', - seq_id_start: int = 0, - sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: - - assert len(seq_output_lens) > 0 - - if sampling_params is None: - sampling_params = SamplingParams() - - prompt_token_ids = [0] * seq_prompt_len - - inputs: EncoderDecoderInputs = { - "decoder": token_inputs(prompt_token_ids), - "encoder": token_inputs(prompt_token_ids), - } - - seqs = [] - for seq_id_offset, output_len in enumerate(seq_output_lens): - # Construct decoder input sequences - seq = Sequence( - seq_id=seq_id_start + seq_id_offset, - inputs=inputs["decoder"], - block_size=16, - ) - - for i in range(output_len): - seq.append_token_id( - token_id=i, - logprobs={i: Logprob(0.0)}, - ) - seqs.append(seq) - - # Encoder input sequence - encoder_seq = Sequence( - seq_id=seq_id_start + len(seq_output_lens), - inputs=inputs["encoder"], - block_size=16, - ) - - return SequenceGroup(request_id=request_id, - seqs=seqs, - sampling_params=sampling_params, - arrival_time=time.time(), - encoder_seq=encoder_seq) - - -def round_up_to_next_block(seq_len: int, block_size: int) -> int: - return (seq_len + block_size - 1) // block_size - - -# Helper functions for scheduler tests - - -def get_sequence_groups(scheduler_output): - return [s.seq_group for s in scheduler_output.scheduled_seq_groups] - - -def append_new_token(out, token_id: int): - seq_groups = get_sequence_groups(out) - for seq_group in seq_groups: - for seq in seq_group.get_seqs(): - seq.append_token_id(token_id, {token_id: Logprob(token_id)}) - - -def 
schedule_and_update_computed_tokens(scheduler): - metas, out, _ = scheduler.schedule() - for s in out.scheduled_seq_groups: - s.seq_group.update_num_computed_tokens(s.token_chunk_size) - return metas, out - - -def append_new_token_seq(seq: Sequence, token_id: int): - seq.append_token_id(token_id, {token_id: Logprob(token_id)}) - - -def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int): - seq_group.update_num_computed_tokens(token_chunk_size) - for seq in seq_group.get_seqs(): - seq.append_token_id(token_id, {token_id: Logprob(token_id)}) - - -class SchedulerProxy: - """ - A proxy class to forward calls to the scheduler. - """ - - def __init__(self, scheduler: Scheduler): - self.scheduler_ = scheduler - self.call_history: dict[str, list[Any]] = defaultdict(list) - - def __getattr__(self, name: str) -> Any: - - def wrapper(*args, **kwargs): - result = getattr(self.scheduler_, name)(*args, **kwargs) - self.call_history[name].append((args, kwargs, result)) - return result - - return wrapper - - def last_schedule_ret( - self, ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, Any]: - _, _, ret = self.call_history["schedule"][-1] - return ret - - -def create_seq_group_metadata_from_prompts( - prompts: list[list[int]], - num_gpu_blocks: int, - block_size: int, - final_prompt_lens: list[int], - continuations: Optional[list[list[int]]] = None, - seq_ids: Optional[list[int]] = None, -) -> list[SequenceGroupMetadata]: - - if continuations is None: - continuations = [[] for _ in prompts] - - if seq_ids is None: - seq_ids = list(i for i, _ in enumerate(prompts)) - - free_gpu_blocks = list(range(num_gpu_blocks)) - - block_allocations = { - i: [ - free_gpu_blocks.pop() - for _ in range(round_up_to_next_block(final_len, block_size)) - ] - for i, final_len in enumerate(final_prompt_lens) - } - - seq_grou_metadata_list = [] - for i, (prompt_token_ids, - cont_token_ids) in enumerate(zip(prompts, continuations)): - data = SequenceData.from_seqs(prompt_token_ids, cont_token_ids) - data.update_num_computed_tokens( - len(prompt_token_ids) + len(cont_token_ids) - 1) - seq_data = {i: data} - seq_grou_metadata_list.append( - SequenceGroupMetadata( - request_id=str(i), - is_prompt=len(cont_token_ids) == 0, - seq_data=seq_data, - sampling_params=SamplingParams(temperature=0.0), - block_tables={i: block_allocations[i][:]}, - )) - return seq_grou_metadata_list - - -def create_chunked_seq_group_metadata_from_prompt( - prompt: list[int], - num_gpu_blocks: int, - chunk_size: int, - block_size: int, - seq_id: Optional[int] = None) -> list[SequenceGroupMetadata]: - - if seq_id is None: - seq_id = 0 - - free_gpu_blocks = list(range(num_gpu_blocks)) - - block_allocations = [ - free_gpu_blocks.pop() - for _ in range(round_up_to_next_block(len(prompt), block_size)) - ] - - seq_group_metadata_list = [] - for i, idx in enumerate(range(0, len(prompt), chunk_size)): - chunk_ids = prompt[idx:idx + chunk_size] - data = SequenceData.from_seqs(prompt) - data.update_num_computed_tokens(idx) - seq_data = {i: data} - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=str(seq_id), - is_prompt=True, - do_sample=idx + chunk_size >= len(prompt), # terminal chunk - seq_data=seq_data, - sampling_params=SamplingParams(temperature=0.0), - block_tables={i: block_allocations}, - token_chunk_size=len(chunk_ids))) - return seq_group_metadata_list - - -def create_batch(batch_size, - k, - prompt_len: Union[int, list[int]] = 10, - prev_output_token_len: int = 10, - seq_ids: Optional[list[int]] = None, - 
num_gpu_blocks: Optional[int] = None, - block_size: Optional[int] = None, - prefill_chunk_size: Optional[int] = None): - if block_size is None: - block_size = 8 - - if num_gpu_blocks is None: - num_gpu_blocks = 2048 // block_size - - iterator = count() - - if isinstance(prompt_len, int): - prompt_lens = [prompt_len for _ in range(batch_size)] - else: - prompt_lens = prompt_len - - prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens] - - if prefill_chunk_size: - # Create a batch of chunked prompts. - if not seq_ids: - seq_ids = list(range(len(prompts))) - seq_group_metadata_list = [] - for p, sid in zip(prompts, seq_ids): - seq_group_metadata_list += \ - create_chunked_seq_group_metadata_from_prompt( - p, num_gpu_blocks, prefill_chunk_size, block_size, sid) - seq_group_metadata_list = seq_group_metadata_list[:batch_size] - prev_output_tokens = [] - else: - prev_output_tokens = [[ - next(iterator) for _ in range(prev_output_token_len) - ] for _ in range(batch_size)] - final_prompt_lens = [ - len(prompt) + len(prev_output_token) + k + 1 - for prompt, prev_output_token in zip(prompts, prev_output_tokens) - ] - - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, num_gpu_blocks, block_size, final_prompt_lens, - prev_output_tokens, seq_ids) - return seq_group_metadata_list, prompts, prev_output_tokens diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 2391430a083a..f2bcf5bba4bb 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -26,18 +26,6 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - For PP, we fall back to V0 by default. This means - that the TP baseline runs with V1 while the PP engine - runs with V0. This gives divergent results with dummy - weights. Once we enable V1 by default for PP, we can - remove this. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - class ParallelSetup(NamedTuple): tp_size: int pp_size: int diff --git a/tests/engine/__init__.py b/tests/engine/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/engine/conftest.py b/tests/engine/conftest.py deleted file mode 100644 index 375b248ebeda..000000000000 --- a/tests/engine/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py deleted file mode 100644 index 5a91758414a5..000000000000 --- a/tests/engine/test_arg_utils.py +++ /dev/null @@ -1,367 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import json -from argparse import ArgumentError -from contextlib import nullcontext -from dataclasses import dataclass, field -from typing import Annotated, Literal, Optional - -import pytest - -from vllm.config import CompilationConfig, config -from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs, - get_type, get_type_hints, is_not_builtin, - is_type, literal_to_kwargs, optional_type, - parse_type) -from vllm.utils import FlexibleArgumentParser - - -@pytest.mark.parametrize(("type", "value", "expected"), [ - (int, "42", 42), - (float, "3.14", 3.14), - (str, "Hello World!", "Hello World!"), - (json.loads, '{"foo":1,"bar":2}', { - "foo": 1, - "bar": 2 - }), -]) -def test_parse_type(type, value, expected): - parse_type_func = parse_type(type) - assert parse_type_func(value) == expected - - -def test_optional_type(): - optional_type_func = optional_type(int) - assert optional_type_func("None") is None - assert optional_type_func("42") == 42 - - -@pytest.mark.parametrize(("type_hint", "type", "expected"), [ - (int, int, True), - (int, float, False), - (list[int], list, True), - (list[int], tuple, False), - (Literal[0, 1], Literal, True), -]) -def test_is_type(type_hint, type, expected): - assert is_type(type_hint, type) == expected - - -@pytest.mark.parametrize(("type_hints", "type", "expected"), [ - ({float, int}, int, True), - ({int, tuple[int]}, int, True), - ({int, tuple[int]}, float, False), - ({str, Literal["x", "y"]}, Literal, True), -]) -def test_contains_type(type_hints, type, expected): - assert contains_type(type_hints, type) == expected - - -@pytest.mark.parametrize(("type_hints", "type", "expected"), [ - ({int, float}, int, int), - ({int, float}, str, None), - ({str, Literal["x", "y"]}, Literal, Literal["x", "y"]), -]) -def test_get_type(type_hints, type, expected): - assert get_type(type_hints, type) == expected - - -@pytest.mark.parametrize(("type_hints", "expected"), [ - ({Literal[1, 2]}, { - "type": int, - "choices": [1, 2] - }), - ({Literal[1, "a"]}, Exception), -]) -def test_literal_to_kwargs(type_hints, expected): - context = nullcontext() - if expected is Exception: - context = pytest.raises(expected) - with context: - assert literal_to_kwargs(type_hints) == expected - - -@config -@dataclass -class NestedConfig: - field: int = 1 - """field""" - - -@config -@dataclass -class FromCliConfig1: - field: int = 1 - """field""" - - @classmethod - def from_cli(cls, cli_value: str): - inst = cls(**json.loads(cli_value)) - inst.field += 1 - return inst - - -@config -@dataclass -class FromCliConfig2: - field: int = 1 - """field""" - - @classmethod - def from_cli(cls, cli_value: str): - inst = cls(**json.loads(cli_value)) - inst.field += 2 - return inst - - -@config -@dataclass -class DummyConfig: - regular_bool: bool = True - """Regular bool with default True""" - optional_bool: Optional[bool] = None - """Optional bool with default None""" - optional_literal: Optional[Literal["x", "y"]] = None - """Optional literal with default None""" - tuple_n: tuple[int, ...] 
= field(default_factory=lambda: (1, 2, 3)) - """Tuple with variable length""" - tuple_2: tuple[int, int] = field(default_factory=lambda: (1, 2)) - """Tuple with fixed length""" - list_n: list[int] = field(default_factory=lambda: [1, 2, 3]) - """List with variable length""" - list_literal: list[Literal[1, 2]] = field(default_factory=list) - """List with literal choices""" - literal_literal: Literal[Literal[1], Literal[2]] = 1 - """Literal of literals with default 1""" - json_tip: dict = field(default_factory=dict) - """Dict which will be JSON in CLI""" - nested_config: NestedConfig = field(default_factory=NestedConfig) - """Nested config""" - from_cli_config1: FromCliConfig1 = field(default_factory=FromCliConfig1) - """Config with from_cli method""" - from_cli_config2: FromCliConfig2 = field(default_factory=FromCliConfig2) - """Different config with from_cli method""" - - -@pytest.mark.parametrize(("type_hint", "expected"), [ - (int, False), - (DummyConfig, True), -]) -def test_is_not_builtin(type_hint, expected): - assert is_not_builtin(type_hint) == expected - - -@pytest.mark.parametrize( - ("type_hint", "expected"), [ - (Annotated[int, "annotation"], {int}), - (Optional[int], {int, type(None)}), - (Annotated[Optional[int], "annotation"], {int, type(None)}), - (Optional[Annotated[int, "annotation"]], {int, type(None)}), - ], - ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"]) -def test_get_type_hints(type_hint, expected): - assert get_type_hints(type_hint) == expected - - -def test_get_kwargs(): - kwargs = get_kwargs(DummyConfig) - print(kwargs) - - # bools should not have their type set - assert kwargs["regular_bool"].get("type") is None - assert kwargs["optional_bool"].get("type") is None - # optional literals should have None as a choice - assert kwargs["optional_literal"]["choices"] == ["x", "y", "None"] - # tuples should have the correct nargs - assert kwargs["tuple_n"]["nargs"] == "+" - assert kwargs["tuple_2"]["nargs"] == 2 - # lists should work - assert kwargs["list_n"]["type"] is int - assert kwargs["list_n"]["nargs"] == "+" - # lists with literals should have the correct choices - assert kwargs["list_literal"]["type"] is int - assert kwargs["list_literal"]["nargs"] == "+" - assert kwargs["list_literal"]["choices"] == [1, 2] - # literals of literals should have merged choices - assert kwargs["literal_literal"]["choices"] == [1, 2] - # dict should have json tip in help - json_tip = "Should either be a valid JSON string or JSON keys" - assert json_tip in kwargs["json_tip"]["help"] - # nested config should should construct the nested config - assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2) - # from_cli configs should be constructed with the correct method - assert kwargs["from_cli_config1"]["type"]('{"field": 2}').field == 3 - assert kwargs["from_cli_config2"]["type"]('{"field": 2}').field == 4 - - -@pytest.mark.parametrize( - ("arg", "expected"), - [ - (None, dict()), - ('{"video": {"num_frames": 123} }', { - "video": { - "num_frames": 123 - } - }), - ( - '{"video": {"num_frames": 123, "fps": 1.0, "foo": "bar"}, "image": {"foo": "bar"} }', # noqa - { - "video": { - "num_frames": 123, - "fps": 1.0, - "foo": "bar" - }, - "image": { - "foo": "bar" - } - }), - ]) -def test_media_io_kwargs_parser(arg, expected): - parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) - if arg is None: - args = parser.parse_args([]) - else: - args = parser.parse_args(["--media-io-kwargs", arg]) - - assert args.media_io_kwargs == expected - - -def 
test_compilation_config(): - parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) - - # default value - args = parser.parse_args([]) - assert args.compilation_config == CompilationConfig() - - # set to O3 - args = parser.parse_args(["-O0"]) - assert args.compilation_config.level == 0 - - # set to O 3 (space) - args = parser.parse_args(["-O", "1"]) - assert args.compilation_config.level == 1 - - # set to O 3 (equals) - args = parser.parse_args(["-O=2"]) - assert args.compilation_config.level == 2 - - # set to O.level 3 - args = parser.parse_args(["-O.level", "3"]) - assert args.compilation_config.level == 3 - - # set to string form of a dict - args = parser.parse_args([ - "-O", - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' - '"use_inductor": false}', - ]) - assert (args.compilation_config.level == 3 and - args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] - and not args.compilation_config.use_inductor) - - # set to string form of a dict - args = parser.parse_args([ - "--compilation-config=" - '{"level": 3, "cudagraph_capture_sizes": [1, 2, 4, 8], ' - '"use_inductor": true}', - ]) - assert (args.compilation_config.level == 3 and - args.compilation_config.cudagraph_capture_sizes == [1, 2, 4, 8] - and args.compilation_config.use_inductor) - - -def test_prefix_cache_default(): - parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) - args = parser.parse_args([]) - - engine_args = EngineArgs.from_cli_args(args=args) - assert (not engine_args.enable_prefix_caching - ), "prefix caching defaults to off." - - # with flag to turn it on. - args = parser.parse_args(["--enable-prefix-caching"]) - engine_args = EngineArgs.from_cli_args(args=args) - assert engine_args.enable_prefix_caching - - # with disable flag to turn it off. 
- args = parser.parse_args(["--no-enable-prefix-caching"]) - engine_args = EngineArgs.from_cli_args(args=args) - assert not engine_args.enable_prefix_caching - - -# yapf: disable -@pytest.mark.parametrize(("arg", "expected", "option"), [ - (None, None, "mm-processor-kwargs"), - ("{}", {}, "mm-processor-kwargs"), - ( - '{"num_crops": 4}', - { - "num_crops": 4 - }, - "mm-processor-kwargs" - ), - ( - '{"foo": {"bar": "baz"}}', - { - "foo": - { - "bar": "baz" - } - }, - "mm-processor-kwargs" - ), - ( - '{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}', - { - "cast_logits_dtype": "bfloat16", - "sequence_parallel_norm": True, - "sequence_parallel_norm_threshold": 2048, - }, - "override-neuron-config" - ), -]) -# yapf: enable -def test_composite_arg_parser(arg, expected, option): - parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) - if arg is None: - args = parser.parse_args([]) - else: - args = parser.parse_args([f"--{option}", arg]) - assert getattr(args, option.replace("-", "_")) == expected - - -def test_human_readable_model_len(): - # `exit_on_error` disabled to test invalid values below - parser = EngineArgs.add_cli_args( - FlexibleArgumentParser(exit_on_error=False)) - - args = parser.parse_args([]) - assert args.max_model_len is None - - args = parser.parse_args(["--max-model-len", "1024"]) - assert args.max_model_len == 1024 - - # Lower - args = parser.parse_args(["--max-model-len", "1m"]) - assert args.max_model_len == 1_000_000 - args = parser.parse_args(["--max-model-len", "10k"]) - assert args.max_model_len == 10_000 - - # Capital - args = parser.parse_args(["--max-model-len", "3K"]) - assert args.max_model_len == 1024 * 3 - args = parser.parse_args(["--max-model-len", "10M"]) - assert args.max_model_len == 2**20 * 10 - - # Decimal values - args = parser.parse_args(["--max-model-len", "10.2k"]) - assert args.max_model_len == 10200 - # ..truncated to the nearest int - args = parser.parse_args(["--max-model-len", "10.212345k"]) - assert args.max_model_len == 10212 - - # Invalid (do not allow decimals with binary multipliers) - for invalid in ["1a", "pwd", "10.24", "1.23M"]: - with pytest.raises(ArgumentError): - args = parser.parse_args(["--max-model-len", invalid]) diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py deleted file mode 100644 index ac5a1f957dfe..000000000000 --- a/tests/engine/test_computed_prefix_blocks.py +++ /dev/null @@ -1,37 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine -from vllm.sampling_params import SamplingParams - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -@pytest.mark.parametrize("block_size", [16]) -def test_computed_prefix_blocks(model: str, block_size: int): - # This test checks if we are able to run the engine to completion - # without triggering asserts. - # We are in a scenario where all blocks from the second request's prompt - # are full and already computed when the second request arrives. - prompt = ( - "You are a helpful assistant. How do I build a car from cardboard and " - "paper clips? 
Is there an easy to follow video tutorial available " - "online for free?") - prompt2 = ( - " Please recommend to me some resources where I can learn not only to " - "handle technical difficulties of building a car, but also " - "decoration.") - - engine_args = EngineArgs(model=model, - block_size=block_size, - enable_prefix_caching=True) - - engine = LLMEngine.from_engine_args(engine_args) - sampling_params = SamplingParams() - - engine.add_request("0", prompt + prompt2, sampling_params) - engine.step() - engine.add_request("1", prompt, sampling_params) - engine.step() diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py deleted file mode 100644 index 15c7a97b50e1..000000000000 --- a/tests/engine/test_executor.py +++ /dev/null @@ -1,111 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import os -from typing import Any, Callable, Optional, Union - -import pytest - -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.llm_engine import LLMEngine -from vllm.executor.uniproc_executor import UniProcExecutor -from vllm.sampling_params import SamplingParams - - -class Mock: - ... - - -class CustomUniExecutor(UniProcExecutor): - - def collective_rpc(self, - method: Union[str, Callable], - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None) -> list[Any]: - # Drop marker to show that this was ran - with open(".marker", "w"): - ... - return super().collective_rpc(method, timeout, args, kwargs) - - -CustomUniExecutorAsync = CustomUniExecutor - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_custom_executor_type_checking(model): - with pytest.raises(ValueError): - engine_args = EngineArgs(model=model, - distributed_executor_backend=Mock) - LLMEngine.from_engine_args(engine_args) - with pytest.raises(ValueError): - engine_args = AsyncEngineArgs(model=model, - distributed_executor_backend=Mock) - AsyncLLMEngine.from_engine_args(engine_args) - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_custom_executor(model, tmp_path): - cwd = os.path.abspath(".") - os.chdir(tmp_path) - try: - assert not os.path.exists(".marker") - - engine_args = EngineArgs( - model=model, - distributed_executor_backend=CustomUniExecutor, - enforce_eager=True, # reduce test time - ) - engine = LLMEngine.from_engine_args(engine_args) - sampling_params = SamplingParams(max_tokens=1) - - engine.add_request("0", "foo", sampling_params) - engine.step() - - assert os.path.exists(".marker") - finally: - os.chdir(cwd) - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_custom_executor_async(model, tmp_path): - cwd = os.path.abspath(".") - os.chdir(tmp_path) - try: - assert not os.path.exists(".marker") - - engine_args = AsyncEngineArgs( - model=model, - distributed_executor_backend=CustomUniExecutorAsync, - enforce_eager=True, # reduce test time - ) - engine = AsyncLLMEngine.from_engine_args(engine_args) - sampling_params = SamplingParams(max_tokens=1) - - async def t(): - stream = await engine.add_request("0", "foo", sampling_params) - async for x in stream: - ... - - asyncio.run(t()) - - assert os.path.exists(".marker") - finally: - os.chdir(cwd) - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_respect_ray(model): - # even for TP=1 and PP=1, - # if users specify ray, we should use ray. 
- # users might do this if they want to manage the - # resources using ray. - engine_args = EngineArgs( - model=model, - distributed_executor_backend="ray", - enforce_eager=True, # reduce test time - ) - engine = LLMEngine.from_engine_args(engine_args) - assert engine.model_executor.uses_ray diff --git a/tests/engine/test_multi_step_output_processor.py b/tests/engine/test_multi_step_output_processor.py deleted file mode 100644 index 458f4deb743a..000000000000 --- a/tests/engine/test_multi_step_output_processor.py +++ /dev/null @@ -1,274 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from unittest.mock import MagicMock - -import pytest -from transformers import PreTrainedTokenizer - -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.sampling_params import SamplingParams -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - SequenceOutput, SequenceStatus) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.utils import Counter - -from ..core.utils import create_seq_group - - -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [1, 12]) -@pytest.mark.skip_global_cleanup -def test_appends_token_ids(num_new_tokens: int, seq_output_len: int): - """Verify multi-step decoding appends token ids correctly. - - We append token ids and verify all the token ids were appended correctly. - Note that ignore_eos=True. - """ - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=1024, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams(max_tokens=seq_output_len + - num_new_tokens, - ignore_eos=True), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids - output_processor.process_outputs(seq_group, outputs) - assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids - - -@pytest.mark.parametrize("seq_prompt_len", [1024]) -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8]) -@pytest.mark.parametrize("max_tokens", [128 + 3]) -@pytest.mark.skip_global_cleanup -def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, max_tokens: int): - """Verify tokens after max_tokens are dropped and not appended to the - sequence. 
- """ - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=seq_prompt_len, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams(max_tokens=max_tokens, ), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_len() == seq_prompt_len + seq_output_len - output_processor.process_outputs(seq_group, outputs) - - # Expect the processed sequence to not go over max tokens in len. - assert seq.get_len() == seq_prompt_len + max_tokens - - # Expect the correct tokens were appended. - expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens - - -@pytest.mark.parametrize("seq_prompt_len", [1024]) -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [12]) -@pytest.mark.parametrize("seed", list(range(6))) -@pytest.mark.skip_global_cleanup -def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, seed: int): - """Verify the eos token id is included in the sequence, but subsequent - tokens are dropped (not appended to sequence). - """ - random.seed(seed) - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - eos_token_id = 100 - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=seq_prompt_len, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams( - # Ensure enough space. - max_tokens=seq_output_len + num_new_tokens, ), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - assert eos_token_id not in new_token_ids - eos_index = random.randint(0, len(new_token_ids) - 1) - new_token_ids[eos_index] = eos_token_id - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_len() == seq_prompt_len + seq_output_len - output_processor.process_outputs(seq_group, outputs) - - # Expect the processed sequence to not go beyond provided eos. - assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1) - - # Expect the correct tokens were appended. 
- expected_appended_tokens = new_token_ids[:eos_index + 1] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens - - -@pytest.mark.parametrize("seq_prompt_len", [1024]) -@pytest.mark.parametrize("seq_output_len", [128]) -@pytest.mark.parametrize("num_new_tokens", [12]) -@pytest.mark.parametrize("seed", list(range(6))) -@pytest.mark.skip_global_cleanup -def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int, - seq_output_len: int, seed: int): - """When sampling parameters dictate that we should ignore the eos token id, - ensure all token ids are appended even if the eos token id is emitted. - """ - random.seed(seed) - detokenizer = MagicMock(spec=Detokenizer) - scheduler = MagicMock(spec=Scheduler) - stop_checker = MagicMock(spec=StopChecker) - seq_counter = Counter() - - eos_token_id = 100 - - output_processor = MultiStepOutputProcessor( - detokenizer=detokenizer, - scheduler=[scheduler], - seq_counter=seq_counter, - get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id), - stop_checker=stop_checker, - ) - - seq_group = create_seq_group( - seq_prompt_len=seq_prompt_len, - seq_output_lens=[seq_output_len], - sampling_params=SamplingParams( - # Ensure enough space. - max_tokens=seq_output_len + num_new_tokens, - ignore_eos=True, - ), - ) - - seq = seq_group.get_seqs()[0] - seq.status = SequenceStatus.RUNNING - - new_token_ids = list(range(num_new_tokens)) - assert eos_token_id not in new_token_ids - eos_index = random.randint(0, len(new_token_ids) - 1) - new_token_ids[eos_index] = eos_token_id - - outputs = [ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - parent_seq_id=seq.seq_id, - output_token=output_token, - logprobs={output_token: Logprob(0.0)}, - ) - ], - prompt_logprobs=None, - ) for output_token in new_token_ids - ] - - assert seq.get_len() == seq_prompt_len + seq_output_len - output_processor.process_outputs(seq_group, outputs) - - # Expect the processed sequence to go beyond eos. - assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens - - # Expect the correct tokens were appended. 
- expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens - - seq_output_len] - assert seq.get_token_ids( - )[-len(expected_appended_tokens):] == expected_appended_tokens - - -def mock_tokenizer(eos_token_id=1000): - tokenizer = MagicMock(spec=PreTrainedTokenizer) - tokenizer.eos_token_id = eos_token_id - return tokenizer diff --git a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py deleted file mode 100644 index b5381b61a020..000000000000 --- a/tests/engine/test_multiproc_workers.py +++ /dev/null @@ -1,179 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -from concurrent.futures import ThreadPoolExecutor -from functools import partial -from time import sleep -from typing import Any - -import pytest - -from vllm.config import VllmConfig -from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, - ResultHandler, WorkerMonitor) -from vllm.worker.worker_base import WorkerWrapperBase - - -class DummyWorkerWrapper(WorkerWrapperBase): - """Dummy version of vllm.worker.worker.Worker""" - - def worker_method(self, worker_input: Any) -> tuple[int, Any]: - sleep(0.05) - - if isinstance(worker_input, Exception): - # simulate error case - raise worker_input - - return self.rpc_rank, input - - -def _start_workers() -> tuple[list[ProcessWorkerWrapper], WorkerMonitor]: - result_handler = ResultHandler() - vllm_config = VllmConfig() - workers = [ - ProcessWorkerWrapper(result_handler, DummyWorkerWrapper, vllm_config, - rank) for rank in range(8) - ] - - worker_monitor = WorkerMonitor(workers, result_handler) - assert not worker_monitor.is_alive() - - result_handler.start() - worker_monitor.start() - assert worker_monitor.is_alive() - - return workers, worker_monitor - - -def test_local_workers() -> None: - """Test workers with sync task submission""" - - workers, worker_monitor = _start_workers() - - def execute_workers(worker_input: str) -> None: - worker_outputs = [ - worker.execute_method("worker_method", worker_input) - for worker in workers - ] - - for rank, output in enumerate(worker_outputs): - assert output.get() == (rank, input) - - executor = ThreadPoolExecutor(max_workers=4) - - # Test concurrent submission from different threads - futures = [ - executor.submit(partial(execute_workers, f"thread {thread_num}")) - for thread_num in range(4) - ] - - for future in futures: - future.result() - - # Test error case - exception = ValueError("fake error") - result = workers[0].execute_method("worker_method", exception) - try: - result.get() - pytest.fail("task should have failed") - except Exception as e: - assert isinstance(e, ValueError) - assert str(e) == "fake error" - - # Test cleanup when a worker fails - assert worker_monitor.is_alive() - workers[3].process.kill() - - # Other workers should get shut down here - worker_monitor.join(20) - - # Ensure everything is stopped - assert not worker_monitor.is_alive() - assert all(not worker.process.is_alive() for worker in workers) - - # Further attempts to submit tasks should fail - try: - _result = workers[0].execute_method("worker_method", "test") - pytest.fail("task should fail once workers have been shut down") - except Exception as e: - assert isinstance(e, ChildProcessError) - - -def test_local_workers_clean_shutdown() -> None: - """Test clean shutdown""" - - workers, worker_monitor = _start_workers() - - assert worker_monitor.is_alive() - assert all(worker.process.is_alive() for worker in workers) - - 
# Clean shutdown - worker_monitor.close() - - worker_monitor.join(20) - - # Ensure everything is stopped - assert not worker_monitor.is_alive() - assert all(not worker.process.is_alive() for worker in workers) - - # Further attempts to submit tasks should fail - try: - _result = workers[0].execute_method("worker_method", "test") - pytest.fail("task should fail once workers have been shut down") - except Exception as e: - assert isinstance(e, ChildProcessError) - - -@pytest.mark.asyncio -async def test_local_workers_async() -> None: - """Test local workers with async task submission""" - - workers, worker_monitor = _start_workers() - - async def execute_workers(worker_input: str) -> None: - worker_coros = [ - worker.execute_method_async("worker_method", worker_input) - for worker in workers - ] - - results = await asyncio.gather(*worker_coros) - for rank, result in enumerate(results): - assert result == (rank, input) - - tasks = [ - asyncio.create_task(execute_workers(f"task {task_num}")) - for task_num in range(4) - ] - - for task in tasks: - await task - - # Test error case - exception = ValueError("fake error") - try: - _result = await workers[0].execute_method_async( - "worker_method", exception) - pytest.fail("task should have failed") - except Exception as e: - assert isinstance(e, ValueError) - assert str(e) == "fake error" - - # Test cleanup when a worker fails - assert worker_monitor.is_alive() - workers[3].process.kill() - - # Other workers should get shut down here - worker_monitor.join(20) - - # Ensure everything is stopped - assert not worker_monitor.is_alive() - assert all(not worker.process.is_alive() for worker in workers) - - # Further attempts to submit tasks should fail - try: - _result = await workers[0].execute_method_async( - "worker_method", "test") - pytest.fail("task should fail once workers have been shut down") - except Exception as e: - assert isinstance(e, ChildProcessError) diff --git a/tests/engine/test_options.py b/tests/engine/test_options.py deleted file mode 100644 index 42e88e84770a..000000000000 --- a/tests/engine/test_options.py +++ /dev/null @@ -1,58 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from contextlib import nullcontext - -import pytest - -from vllm.entrypoints.llm import LLM -from vllm.sampling_params import SamplingParams - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -def test_skip_tokenizer_initialization(model: str): - # This test checks if the flag skip_tokenizer_init skips the initialization - # of tokenizer and detokenizer. The generated output is expected to contain - # token ids. 
- llm = LLM( - model=model, - skip_tokenizer_init=True, - enforce_eager=True, - ) - sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) - - with pytest.raises(ValueError, match="cannot pass text prompts when"): - llm.generate("abc", sampling_params) - - outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, - sampling_params=sampling_params) - assert len(outputs) > 0 - completions = outputs[0].outputs - assert len(completions) > 0 - assert completions[0].text == "" - assert completions[0].token_ids - - -@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) -@pytest.mark.parametrize("enable_prompt_embeds", [True, False]) -def test_enable_prompt_embeds(hf_runner, model: str, - enable_prompt_embeds: bool): - prompt = "abc" - - with hf_runner(model) as hf_model: - token_ids = hf_model.tokenizer(prompt, return_tensors="pt").input_ids - token_ids = token_ids.to(hf_model.model.device) - - embed_layer = hf_model.model.get_input_embeddings() - prompt_embeds = embed_layer(token_ids).squeeze(0) - - ctx = (nullcontext() if enable_prompt_embeds else pytest.raises( - ValueError, match="set `--enable-prompt-embeds`")) - - llm = LLM( - model=model, - enable_prompt_embeds=enable_prompt_embeds, - enforce_eager=True, - ) - - with ctx: - llm.generate({"prompt_embeds": prompt_embeds}) diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py deleted file mode 100644 index 9c62761d78af..000000000000 --- a/tests/engine/test_short_mm_context.py +++ /dev/null @@ -1,33 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from ..conftest import IMAGE_ASSETS - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) - -models = ["llava-hf/llava-1.5-7b-hf"] - - -@pytest.mark.parametrize("model", models) -def test_context_length_too_short(vllm_runner, image_assets, model): - images = [asset.pil_image for asset in image_assets] - - with pytest.raises(ValueError, - match="longer than the maximum model length"): - vllm_model = vllm_runner( - model, - max_model_len=128, # LLaVA has a feature size of 576 - enforce_eager=True, - ) - - with vllm_model: - vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]], - max_tokens=1, - images=[images[0]]) diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py deleted file mode 100644 index 8cae8a80d38e..000000000000 --- a/tests/metrics/test_metrics.py +++ /dev/null @@ -1,307 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import ray -from prometheus_client import REGISTRY - -import vllm.envs as envs -from vllm import EngineArgs, LLMEngine -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.engine.metrics import RayPrometheusStatLogger -from vllm.sampling_params import SamplingParams -from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module tests V0 internals, so set VLLM_USE_V1=0. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -MODELS = [ - "distilbert/distilgpt2", -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_metric_counter_prompt_tokens( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) as vllm_model: - tokenizer = vllm_model.llm.get_tokenizer() - prompt_token_counts = [ - len(tokenizer.encode(p)) for p in example_prompts - ] - # This test needs at least 2 prompts in a batch of different lengths to - # verify their token count is correct despite padding. - assert len(example_prompts) > 1, "at least 2 prompts are required" - assert prompt_token_counts[0] != prompt_token_counts[1], ( - "prompts of different lengths are required") - vllm_prompt_token_count = sum(prompt_token_counts) - - _ = vllm_model.generate_greedy(example_prompts, max_tokens) - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metric_count = stat_logger.metrics.counter_prompt_tokens.labels( - **stat_logger.labels)._value.get() - - assert vllm_prompt_token_count == metric_count, ( - f"prompt token count: {vllm_prompt_token_count!r}\n" - f"metric: {metric_count!r}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_metric_counter_generation_tokens( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.llm.get_tokenizer() - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metric_count = stat_logger.metrics.counter_generation_tokens.labels( - **stat_logger.labels)._value.get() - vllm_generation_count = 0 - for i in range(len(example_prompts)): - vllm_output_ids, vllm_output_str = vllm_outputs[i] - prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. - # We're interested only in the count of the generation tokens. 
- vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) - - assert vllm_generation_count == metric_count, ( - f"generation token count: {vllm_generation_count!r}\n" - f"metric: {metric_count!r}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("max_tokens", [128, 129]) -@pytest.mark.parametrize("disable_async_output_proc", [True, False]) -def test_metric_counter_generation_tokens_multi_step( - vllm_runner, - example_prompts, - model: str, - max_tokens: int, - disable_async_output_proc: bool, -) -> None: - num_scheduler_steps = 8 - with vllm_runner( - model, - disable_log_stats=False, - gpu_memory_utilization=0.4, - num_scheduler_steps=num_scheduler_steps, - disable_async_output_proc=disable_async_output_proc, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.llm.get_tokenizer() - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metric_count = stat_logger.metrics.counter_generation_tokens.labels( - **stat_logger.labels)._value.get() - vllm_generation_count = 0 - for i in range(len(example_prompts)): - vllm_output_ids, vllm_output_str = vllm_outputs[i] - prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. - # We're interested only in the count of the generation tokens. - vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) - - # The multi-step scheduling will continue to execute forward even when - # encountering EOS, leading to slightly imprecise metrics. - assert abs(vllm_generation_count - metric_count) <\ - len(example_prompts) * num_scheduler_steps, \ - (f"generation token count: {vllm_generation_count!r}\n" - f"metric: {metric_count!r}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize( - "served_model_name", - [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]]) -def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, - served_model_name: list[str]) -> None: - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.3, - served_model_name=served_model_name) as vllm_model: - stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] - metrics_tag_content = stat_logger.labels["model_name"] - - if envs.VLLM_CI_USE_S3: - model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}" - if served_model_name is None or served_model_name == []: - assert metrics_tag_content == model, ( - f"Metrics tag model_name is wrong! expect: {model!r}\n" - f"actual: {metrics_tag_content!r}") - else: - assert metrics_tag_content == served_model_name[0], ( - f"Metrics tag model_name is wrong! 
expect: " - f"{served_model_name[0]!r}\n" - f"actual: {metrics_tag_content!r}") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("disable_log_stats", [True, False]) -@pytest.mark.asyncio -async def test_async_engine_log_metrics_regression( - example_prompts, - model: str, - dtype: str, - max_tokens: int, - disable_log_stats: bool, -) -> None: - """ - Regression test ensuring async engine generates metrics - when disable_log_stats=False - (see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678) - """ - engine_args = AsyncEngineArgs( - model=model, - dtype=dtype, - disable_log_stats=disable_log_stats, - ) - async_engine = AsyncLLMEngine.from_engine_args(engine_args) - for i, prompt in enumerate(example_prompts): - results = async_engine.generate( - prompt, - SamplingParams(max_tokens=max_tokens), - f"request-id-{i}", - ) - # Exhaust the async iterator to make the async engine work - async for _ in results: - pass - - assert_metrics(model, async_engine.engine, disable_log_stats, - len(example_prompts)) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("disable_log_stats", [True, False]) -def test_engine_log_metrics_regression( - example_prompts, - model: str, - dtype: str, - max_tokens: int, - disable_log_stats: bool, -) -> None: - engine_args = EngineArgs( - model=model, - dtype=dtype, - disable_log_stats=disable_log_stats, - ) - engine = LLMEngine.from_engine_args(engine_args) - for i, prompt in enumerate(example_prompts): - engine.add_request( - f"request-id-{i}", - prompt, - SamplingParams(max_tokens=max_tokens), - ) - while engine.has_unfinished_requests(): - engine.step() - - if envs.VLLM_CI_USE_S3: - model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}" - assert_metrics(model, engine, disable_log_stats, len(example_prompts)) - - -def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool, - num_requests: int) -> None: - if disable_log_stats: - with pytest.raises(AttributeError): - _ = engine.stat_loggers - else: - assert (engine.stat_loggers - is not None), "engine.stat_loggers should be set" - # Ensure the count bucket of request-level histogram metrics matches - # the number of requests as a simple sanity check to ensure metrics are - # generated - labels = {'model_name': model} - request_histogram_metrics = [ - "vllm:e2e_request_latency_seconds", - "vllm:request_prompt_tokens", - "vllm:request_generation_tokens", - "vllm:request_params_n", - "vllm:request_params_max_tokens", - ] - for metric_name in request_histogram_metrics: - metric_value = REGISTRY.get_sample_value(f"{metric_name}_count", - labels) - assert ( - metric_value == num_requests), "Metrics should be collected" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [16]) -def test_engine_log_metrics_ray( - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - # This test is quite weak - it only checks that we can use - # RayPrometheusStatLogger without exceptions. - # Checking whether the metrics are actually emitted is unfortunately - # non-trivial. 
- - # We have to run in a Ray task for Ray metrics to be emitted correctly - @ray.remote(num_gpus=1) - def _inner(): - - class _RayPrometheusStatLogger(RayPrometheusStatLogger): - - def __init__(self, *args, **kwargs): - self._i = 0 - super().__init__(*args, **kwargs) - - def log(self, *args, **kwargs): - self._i += 1 - return super().log(*args, **kwargs) - - engine_args = EngineArgs( - model=model, - dtype=dtype, - disable_log_stats=False, - ) - engine = LLMEngine.from_engine_args(engine_args) - logger = _RayPrometheusStatLogger( - local_interval=0.5, - labels=dict(model_name=engine.model_config.served_model_name), - vllm_config=engine.vllm_config) - engine.add_logger("ray", logger) - for i, prompt in enumerate(example_prompts): - engine.add_request( - f"request-id-{i}", - prompt, - SamplingParams(max_tokens=max_tokens), - ) - while engine.has_unfinished_requests(): - engine.step() - assert logger._i > 0, ".log must be called at least once" - - ray.get(_inner.remote()) diff --git a/tests/model_executor/test_logits_processor.py b/tests/model_executor/test_logits_processor.py deleted file mode 100644 index 532ebba038d3..000000000000 --- a/tests/model_executor/test_logits_processor.py +++ /dev/null @@ -1,98 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from unittest.mock import patch - -import pytest -import torch - -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import is_pin_memory_available - - -class MockLogitsProcessor(LogitsProcessor): - - def __init__(self, vocab_size: int, scale: float, - fake_logits: torch.Tensor): - super().__init__(vocab_size=vocab_size, scale=scale) - self.fake_logits = fake_logits.clone() - - def forward(self, *args, **kwargs): - with patch( - "vllm.model_executor.layers.logits_processor._prune_hidden_states", - lambda x, y: x - ), patch( - "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits", - lambda *args, **kwargs: self.fake_logits): - return super().forward(*args, **kwargs) - - -def _prepare_test( - batch_size: int -) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]: - vocab_size = 32000 - input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) - fake_logits = torch.full((batch_size, vocab_size), - 1e-2, - dtype=input_tensor.dtype) - logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits) - return input_tensor, fake_logits, logits_processor - - -RANDOM_SEEDS = list(range(128)) -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_logits_processors(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, logits_processor = _prepare_test(batch_size) - - # This sample logits processor gives infinite score to the i-th token, - # where i is the length of the input sequence. - # We therefore expect the output token sequence to be [0, 1, 2, ...] 
- def pick_ith(token_ids, logits): - logits[len(token_ids)] = float("inf") - return logits - - seq_group_metadata_list = [] - seq_lens = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams(temperature=0, - logits_processors=[pick_ith]), - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - logits_processor_output = logits_processor( - lm_head=None, - hidden_states=input_tensor, - sampling_metadata=sampling_metadata) - - assert torch.isinf(logits_processor_output[:, 0]).all() - - fake_logits *= logits_processor.scale - torch.testing.assert_close(logits_processor_output[:, 1], - fake_logits[:, 1], - rtol=1e-4, - atol=0.0) diff --git a/tests/mq_llm_engine/__init__.py b/tests/mq_llm_engine/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/mq_llm_engine/conftest.py b/tests/mq_llm_engine/conftest.py deleted file mode 100644 index 375b248ebeda..000000000000 --- a/tests/mq_llm_engine/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/mq_llm_engine/test_abort.py b/tests/mq_llm_engine/test_abort.py deleted file mode 100644 index 5ff08cbb3248..000000000000 --- a/tests/mq_llm_engine/test_abort.py +++ /dev/null @@ -1,69 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test that aborting is handled properly.""" - -import asyncio -import tempfile -import uuid - -import pytest - -from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate -from vllm.engine.arg_utils import AsyncEngineArgs - -MODEL = "google/gemma-1.1-2b-it" -ENGINE_ARGS = AsyncEngineArgs(model=MODEL) -RAISED_ERROR = KeyError -RAISED_VALUE = "foo" -EXPECTED_TOKENS = 250 - - -@pytest.fixture(scope="function") -def tmp_socket(): - with tempfile.TemporaryDirectory() as td: - yield f"ipc://{td}/{uuid.uuid4()}" - - -@pytest.mark.asyncio -async def test_abort(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket) as engine: - - client = await engine.make_client() - - request_id_to_be_aborted = "request-aborted" - request_ids_a = [f"request-a-{idx}" for idx in range(10)] - request_ids_b = [f"request-b-{idx}" for idx in range(10)] - - # Requests started before one to be aborted. - tasks = [] - for request_id in request_ids_a: - tasks.append( - asyncio.create_task( - generate(client, request_id, EXPECTED_TOKENS))) - - # Aborted. - task_aborted = asyncio.create_task( - generate(client, request_id_to_be_aborted, EXPECTED_TOKENS)) - - # Requests started after one to be aborted. - for request_id in request_ids_b: - tasks.append( - asyncio.create_task( - generate(client, request_id, EXPECTED_TOKENS))) - - # Actually abort. - await asyncio.sleep(0.5) - await client.abort(request_id_to_be_aborted) - - # Confirm that we got all the EXPECTED tokens from the requests. 
- for task in tasks: - count, request_id = await task - assert count == EXPECTED_TOKENS, ( - f"{request_id} generated only {count} tokens") - - # Cancel task (this will hang indefinitely if not). - task_aborted.cancel() - - # Shutdown. - client.close() diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py deleted file mode 100644 index 3feee01dadf7..000000000000 --- a/tests/mq_llm_engine/test_error_handling.py +++ /dev/null @@ -1,376 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test that various errors are handled properly.""" - -import asyncio -import tempfile -import time -import uuid -from unittest.mock import Mock - -import pytest - -from tests.mq_llm_engine.utils import RemoteMQLLMEngine -from vllm import SamplingParams -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.llm_engine import LLMEngine -from vllm.engine.multiprocessing import MQEngineDeadError -from vllm.engine.multiprocessing.engine import MQLLMEngine -from vllm.entrypoints.openai.api_server import build_async_engine_client -from vllm.entrypoints.openai.cli_args import make_arg_parser -from vllm.lora.request import LoRARequest -from vllm.sequence import SequenceGroupMetadata -from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser - -MODEL = "google/gemma-1.1-2b-it" -ENGINE_ARGS = AsyncEngineArgs(model=MODEL, enforce_eager=True) -RAISED_ERROR = KeyError -RAISED_VALUE = "foo" - - -@pytest.fixture(scope="function") -def tmp_socket(): - with tempfile.TemporaryDirectory() as td: - yield f"ipc://{td}/{uuid.uuid4()}" - - -def run_with_evil_forward(engine_args: AsyncEngineArgs, ipc_path: str): - # Make engine. - engine = MQLLMEngine.from_engine_args( - engine_args=engine_args, - usage_context=UsageContext.UNKNOWN_CONTEXT, - ipc_path=ipc_path) - - # Raise error during first forward pass. - engine.engine.model_executor.execute_model = Mock( - side_effect=RAISED_ERROR(RAISED_VALUE)) - - # Run engine. - engine.start() - - -@pytest.mark.asyncio -async def test_evil_forward(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket, - run_fn=run_with_evil_forward) as engine: - - client = await engine.make_client() - - # Server should be healthy after initial probe. - await asyncio.sleep(2.0) - await client.check_health() - - # Throws an error that should get ENGINE_DEAD_ERROR. - with pytest.raises(MQEngineDeadError): - async for _ in client.generate(prompt="Hello my name is", - sampling_params=SamplingParams(), - request_id=str(uuid.uuid4())): - pass - assert client.errored - - await asyncio.sleep(1.0) - with pytest.raises(RAISED_ERROR): - await client.check_health() - assert client.errored - - # Shutdown. - client.close() - - -def run_with_evil_model_executor_health(engine_args: AsyncEngineArgs, - ipc_path: str): - # Make engine. - engine = MQLLMEngine.from_engine_args( - engine_args=engine_args, - usage_context=UsageContext.UNKNOWN_CONTEXT, - ipc_path=ipc_path) - - # Raise error during first forward pass. - engine.engine.model_executor.check_health = Mock(side_effect=RAISED_ERROR) - - # Run engine. - engine.start() - - -@pytest.mark.asyncio -async def test_failed_health_check(tmp_socket): - with RemoteMQLLMEngine( - engine_args=ENGINE_ARGS, - ipc_path=tmp_socket, - run_fn=run_with_evil_model_executor_health) as engine: - - client = await engine.make_client() - assert client.is_running - - # Health probe should throw RAISED_ERROR. 
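# (Presumably the long sleep gives the client's background health-check loop
# time to observe the mocked model_executor.check_health failure; the exact
# interval needed is an implementation detail of MQLLMEngineClient.)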
- await asyncio.sleep(15.) - - with pytest.raises(RAISED_ERROR): - await client.check_health() - assert client.errored - - # Generate call should throw ENGINE_DEAD_ERROR - with pytest.raises(MQEngineDeadError): - async for _ in client.generate(prompt="Hello my name is", - sampling_params=SamplingParams(), - request_id=str(uuid.uuid4())): - pass - - client.close() - - -def run_with_evil_abort(engine_args: AsyncEngineArgs, ipc_path: str): - # Make engine. - engine = MQLLMEngine.from_engine_args( - engine_args=engine_args, - usage_context=UsageContext.UNKNOWN_CONTEXT, - ipc_path=ipc_path) - - # Raise error during abort call. - engine.engine.abort_request = Mock(side_effect=RAISED_ERROR) - - # Run engine. - engine.start() - - -@pytest.mark.asyncio -async def test_failed_abort(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket, - run_fn=run_with_evil_abort) as engine: - - client = await engine.make_client() - assert client.is_running - - # First check health should work. - await client.check_health() - - # Trigger an abort on the client side. - # This request ID does not exist, and will cause the engine to error - await client.abort(request_id="foo") - - # Future generation requests will now fail - # with reference to the original KeyError("foo") - with pytest.raises(MQEngineDeadError) as execinfo: - async for _ in client.generate( - prompt="Hello my name is", - sampling_params=SamplingParams(max_tokens=10), - request_id=str(uuid.uuid4())): - pass - assert "KeyError" in repr(execinfo.value) - assert client.errored - - # This should raise the original error. - with pytest.raises(RAISED_ERROR): - await client.check_health() - - client.close() - - -@pytest.mark.asyncio -async def test_batch_error(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket, - run_fn=run_with_evil_abort) as engine: - - client = await engine.make_client() - assert client.is_running - - # First check health should work. - await client.check_health() - - # Batch of requests - async def do_generate(client): - # min_tokens=2048 to keep busy the engine busy - # to get enough time to get process a request - # that will crash the engine - params = SamplingParams(min_tokens=2048, max_tokens=2048) - async for _ in client.generate(prompt="Hello my name is", - sampling_params=params, - request_id=str(uuid.uuid4())): - pass - - tasks = [asyncio.create_task(do_generate(client)) for _ in range(10)] - - # This request will force a processing batch to raise - # an exception and next the engine get errored - await client.abort(request_id="foo") - - # The batch of those request failed, then they - # should get the same exception as a MQEngineDeadError. - errors = await asyncio.gather(*tasks, return_exceptions=True) - for e in errors: - assert isinstance(e, MQEngineDeadError) - assert "KeyError" in repr(e) - - client.close() - - -@pytest.mark.asyncio -async def test_bad_request(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket) as engine: - - client = await engine.make_client() - - # Invalid request should fail, but not crash the server. - with pytest.raises(ValueError): - async for _ in client.generate(prompt="Hello my name is", - sampling_params=SamplingParams(), - request_id="abcd-1", - lora_request=LoRARequest( - "invalid-lora", 1, - "invalid-path")): - pass - - # This request should be okay. - async for _ in client.generate(prompt="Hello my name is", - sampling_params=SamplingParams(), - request_id="abcd-2"): - pass - - # Shutdown. 
- client.close() - - -@pytest.mark.asyncio -async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: - - parser = FlexibleArgumentParser( - description="vLLM's remote OpenAI server.") - parser = make_arg_parser(parser) - args = parser.parse_args([]) - - # When LLMEngine is loaded, it will crash. - def mock_init(): - raise ValueError - - m.setattr(LLMEngine, "__init__", mock_init) - - start = time.perf_counter() - async with build_async_engine_client(args): - pass - end = time.perf_counter() - - assert end - start < 60, ( - "Expected vLLM to gracefully shutdown in <60s " - "if there is an error in the startup.") - - -@pytest.mark.asyncio -async def test_mp_cuda_init(): - # it should not crash, when cuda is initialized - # in the API server process - import torch - torch.cuda.init() - parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.") - parser = make_arg_parser(parser) - args = parser.parse_args([]) - - async with build_async_engine_client(args): - pass - - -@pytest.mark.asyncio -async def test_engine_process_death(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket) as engine: - - client = await engine.make_client() - assert client.is_running - - # kill the engine process - engine.proc.kill() - - # Generate call should fail - with pytest.raises(MQEngineDeadError): - async for _ in client.generate(prompt="Hello my name is", - sampling_params=SamplingParams(), - request_id=str(uuid.uuid4())): - pass - - # And the health check should show the engine is dead - with pytest.raises(RuntimeError, match="Engine process .* died"): - await client.check_health() - - client.close() - - -def run_with_evil_input_processing(engine_args: AsyncEngineArgs, - ipc_path: str): - """Simulate an exception while preparing inputs for the model. - In the wild, this could be something like a multimodal input processor - failing on invalid image data.""" - - # Make engine. - engine = MQLLMEngine.from_engine_args( - engine_args=engine_args, - usage_context=UsageContext.UNKNOWN_CONTEXT, - ipc_path=ipc_path) - - runner = engine.engine.model_executor.driver_worker.worker.model_runner - - # Raise error in the model runner when adding a sequence group. - # See class ModelInputForGPUBuilder - def raiser(_, seq_group_metadata: SequenceGroupMetadata): - if seq_group_metadata.request_id.startswith("evil"): - raise RAISED_ERROR(RAISED_VALUE) - - runner.builder.per_seq_group_compute_fns.append(raiser) - - # Run engine. 
- engine.start() - - -@pytest.mark.asyncio -async def test_failed_inputs(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket, - run_fn=run_with_evil_input_processing) as engine: - - client = await engine.make_client() - assert client.is_running - - # Engine should be healthy - await client.check_health() - - async def run_failing_request(): - async for _ in client.generate( - prompt="Hello my name is", - sampling_params=SamplingParams(max_tokens=10), - request_id="evil" + str(uuid.uuid4())): - pass - - async def run_passing_request(): - async for _ in client.generate( - prompt="Hello my name is", - sampling_params=SamplingParams(max_tokens=10), - request_id=str(uuid.uuid4())): - pass - - passing_tasks = [ - asyncio.create_task(run_passing_request()) for _ in range(10) - ] - failing_tasks = [ - asyncio.create_task(run_failing_request()) for _ in range(10) - ] - await asyncio.gather(*failing_tasks, return_exceptions=True) - await asyncio.gather(*passing_tasks) - - # All the bad inputs should have raised - for task in failing_tasks: - with pytest.raises(RAISED_ERROR): - task.result() - - # But all good inputs should have still succeeded - for task in passing_tasks: - task.result() - - # And the engine should remain healthy - assert not client.errored - await client.check_health() - - client.close() diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py deleted file mode 100644 index e9fd5b814f28..000000000000 --- a/tests/mq_llm_engine/test_load.py +++ /dev/null @@ -1,59 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test that the MQLLMEngine is able to handle 10k concurrent requests.""" - -import asyncio -import tempfile -import uuid - -import pytest - -from tests.mq_llm_engine.utils import RemoteMQLLMEngine, generate -from vllm.engine.arg_utils import AsyncEngineArgs - -MODEL = "google/gemma-1.1-2b-it" -NUM_EXPECTED_TOKENS = 10 -NUM_REQUESTS = 10000 - -# Scenarios to test for num generated token. -ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True) - - -@pytest.fixture(scope="function") -def tmp_socket(): - with tempfile.TemporaryDirectory() as td: - yield f"ipc://{td}/{uuid.uuid4()}" - - -@pytest.mark.asyncio -async def test_load(tmp_socket): - with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, - ipc_path=tmp_socket) as engine: - - client = await engine.make_client() - - request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] - - # Create concurrent requests. - tasks = [] - for request_id in request_ids: - tasks.append( - asyncio.create_task( - generate(client, request_id, NUM_EXPECTED_TOKENS))) - - # Confirm that we got all the EXPECTED tokens from the requests. - failed_request_id = None - tokens = None - for task in tasks: - num_generated_tokens, request_id = await task - if (num_generated_tokens != NUM_EXPECTED_TOKENS - and failed_request_id is None): - failed_request_id = request_id - tokens = num_generated_tokens - - assert failed_request_id is None, ( - f"{failed_request_id} generated {tokens} but " - f"expected {NUM_EXPECTED_TOKENS}") - - # Shutdown. 
- client.close() diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py deleted file mode 100644 index 7976d5031aea..000000000000 --- a/tests/mq_llm_engine/utils.py +++ /dev/null @@ -1,81 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import multiprocessing -from typing import Callable, Union - -from vllm import SamplingParams -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.multiprocessing.client import MQLLMEngineClient -from vllm.engine.multiprocessing.engine import MQLLMEngine -from vllm.outputs import RequestOutput -from vllm.usage.usage_lib import UsageContext - - -async def generate( - client: MQLLMEngineClient, - request_id: str, - num_tokens: int, - return_output: bool = False) -> Union[RequestOutput, tuple[int, str]]: - - final_output = None - count = 0 - async for out in client.generate( - request_id=request_id, - prompt="Hello my name is Robert and", - sampling_params=SamplingParams(max_tokens=num_tokens, - temperature=0)): - - count += 1 - final_output = out - await asyncio.sleep(0.) - - if return_output: - return final_output - - # Confirm we generated all the tokens we expected. - return count, request_id - - -def run_normal(engine_args: AsyncEngineArgs, ipc_path: str): - # Make engine. - engine = MQLLMEngine.from_engine_args( - engine_args=engine_args, - usage_context=UsageContext.UNKNOWN_CONTEXT, - ipc_path=ipc_path) - - # Run engine. - engine.start() - - -class RemoteMQLLMEngine: - - def __init__(self, - engine_args: AsyncEngineArgs, - ipc_path: str, - run_fn: Callable = run_normal) -> None: - - self.engine_args = engine_args - self.ipc_path = ipc_path - context = multiprocessing.get_context("spawn") - self.proc = context.Process(target=run_fn, - args=(engine_args, ipc_path)) - self.proc.start() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.proc.kill() - - async def make_client(self) -> MQLLMEngineClient: - engine_config = self.engine_args.create_engine_config() - client = MQLLMEngineClient(self.ipc_path, engine_config, self.proc.pid) - while True: - try: - await client.setup() - break - except TimeoutError: - assert self.proc.is_alive() - return client diff --git a/tests/multi_step/__init__.py b/tests/multi_step/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py deleted file mode 100644 index 56e339d485c5..000000000000 --- a/tests/multi_step/test_correctness_async_llm.py +++ /dev/null @@ -1,232 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Test the AsyncLLMEngine with multi-step-decoding -from typing import Optional - -import pytest - -from vllm.utils import STR_BACKEND_ENV_VAR - -from ..models.utils import check_logprobs_close -from ..utils import (completions_with_server_args, get_client_text_generations, - get_client_text_logprob_generations) - -MODELS = [ - "JackFram/llama-160m", -] -NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps -NUM_PROMPTS = [10] - -DEFAULT_SERVER_ARGS: list[str] = [ - "--distributed-executor-backend", - "ray", - "--gpu-memory-utilization", - "0.85", - "--swap-space", - "16", -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize(("tp_size, pp_size"), [ - (1, 1), - (2, 2), -]) -@pytest.mark.parametrize("eager_mode", [False, True]) 
-@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("is_async", [True]) -@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) -@pytest.mark.parametrize("enable_chunked_prefill", [True, False]) -@pytest.mark.asyncio -async def test_multi_step( - example_prompts, - model: str, - tp_size: int, - pp_size: int, - eager_mode: int, - num_scheduler_steps: int, - num_prompts: int, - is_async: bool, - num_logprobs: Optional[int], - attention_backend: str, - enable_chunked_prefill: bool, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test vLLM engine with multi-step scheduling in an OpenAI-protocol - client/server environment. - - Set up an engine with single-step scheduling as a ground-truth reference. - - Send a completions API request to both engines with the same prompts. - - Validate: - * Generated tokens match - * Generated logprobs are all very close - - Args: - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - tp_size: degree of tensor-parallelism - pp_size: degree of pipeline-parallelism - eager_mode - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> no logprobs - """ - if enable_chunked_prefill and \ - (pp_size > 1 or attention_backend != "FLASH_ATTN"): - pytest.skip("Multi-step with Chunked-Prefill only supports" - "PP=1 and FLASH_ATTN backend") - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts - - server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"] - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] - - if not is_async: - ms_server_args += ["--disable-async-output-proc"] - - if eager_mode: - ms_server_args.append("--enforce-eager") - - if enable_chunked_prefill: - ms_server_args.append("--enable-chunked-prefill") - - distributed_args = [ - "--tensor-parallel-size", - str(tp_size), - "--pipeline-parallel-size", - str(pp_size), - ] - - # Spin up client/server & issue completion API requests. - # Default `max_wait_seconds` is 240 but was empirically - # was raised 5x to 1200 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts, - model, - server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) - test_completions = await completions_with_server_args( - prompts, - model, - ms_server_args + distributed_args, - num_logprobs, - max_wait_seconds=5 * 240) - - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. - ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) - assert ref_generations == test_generations - - # Assert multi-step scheduling produces nearly-identical logprobs - # to single-step scheduling. 
- ref_text_logprobs = get_client_text_logprob_generations( - ref_completions) - test_text_logprobs = get_client_text_logprob_generations( - test_completions) - check_logprobs_close( - outputs_0_lst=ref_text_logprobs, - outputs_1_lst=test_text_logprobs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize(("tp_size, pp_size"), [ - (1, 2), -]) -@pytest.mark.asyncio -async def test_multi_step_pp_smoke( - tp_size: int, - pp_size: int, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """ - Smoke test for the vLLM engine with multi-step scheduling in an - OpenAI-protocol client/server environment. - - This tests compares the outputs between multi-step scheduling and - single-step scheduling. Notably, this test lets the engines generate - more tokens (default is 5) and test for an exact match over all the - tokens. - - Args: - tp_size: degree of tensor-parallelism - pp_size: degree of pipeline-parallelism - eager_mode - """ - - model = "JackFram/llama-160m" - num_scheduler_steps = 8 - attention_backend = "FLASH_ATTN" - max_num_seqs = 3 - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - # Prompt from the ShareGPT dataset - prompts = [ - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - "in the jtbd context whats a push?", # codespell:ignore - ] - # Use varying max_tokens to introduce scheduling randomness. - max_tokens = [10 * i for i in range(1, len(prompts) + 1)] - assert len(prompts) == len(max_tokens) - - test_args = [ - "--tensor-parallel-size", - str(tp_size), "--pipeline-parallel-size", - str(pp_size), "--max-num-seqs", - str(max_num_seqs) - ] - - server_args = DEFAULT_SERVER_ARGS + test_args - ms_server_args = DEFAULT_SERVER_ARGS + \ - ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ - test_args - - # Spin up client/server & issue completion API requests. - # Default `max_wait_seconds` is 240 but was empirically - # was raised 3x to 720 *just for this test* due to - # observed timeouts in GHA CI - ref_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) - - test_completions = await completions_with_server_args( - prompts=prompts, - model_name=model, - server_cli_args=ms_server_args, - num_logprobs=None, - max_wait_seconds=5 * 240, - max_tokens=max_tokens) - - # Assert multi-step scheduling produces identical tokens - # to single-step scheduling. 
- ref_generations = get_client_text_generations(ref_completions) - test_generations = get_client_text_generations(test_completions) - - assert ref_generations == test_generations diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py deleted file mode 100644 index 0df00c98b72c..000000000000 --- a/tests/multi_step/test_correctness_llm.py +++ /dev/null @@ -1,383 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Test the LLMEngine with multi-step-decoding - -import copy -from typing import Optional - -import pytest - -from vllm.platforms import current_platform -from vllm.utils import STR_BACKEND_ENV_VAR - -from ..models.utils import check_logprobs_close, check_outputs_equal - -MODELS = [ - "JackFram/llama-160m", -] -NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps -NUM_PROMPTS = [10] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp_size", [1]) -@pytest.mark.parametrize("enable_chunked_prefill", [False, True]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True, False]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs", [None, 5]) -@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN", "FLASHINFER"]) -def test_multi_step_llm( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp_size: int, - enable_chunked_prefill: bool, - max_tokens: int, - enforce_eager: int, - num_scheduler_steps: int, - num_prompts: int, - num_logprobs: Optional[int], - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test vLLM engine with multi-step scheduling via sync LLM Engine. - - Set up a HuggingFace (HF) transformers model as a ground-truth reference. - - Prompt them with the same example prompts. - - Validate: - * Generated tokens match - * Generated logprobs are all very close - - Args: - hf_runner: HF transformers model runner fixture - vllm_runner: vLLM model runner fixture - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - dtype: tensor datatype for engine to utilize - tp_size: degree of tensor-parallelism - enable_chunked_prefill: chunked-prefill on/off - max_tokens: the maximum number of tokens to generate - enforce_eager - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> 1 logprob returned. 
- """ - if current_platform.is_rocm() and \ - (attention_backend == "FLASHINFER" or enable_chunked_prefill): - pytest.skip( - "Multi-Step with FLASHINFER or Chunked-Prefill is not supported" - "on ROCm") - - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts - - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=enable_chunked_prefill, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - vllm_model.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs)) - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = (hf_model.generate_greedy(prompts, max_tokens) - if num_logprobs is None else - hf_model.generate_greedy_logprobs_limit( - prompts, max_tokens, num_logprobs)) - - if num_logprobs is None: - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - else: - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp_size", [1]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs,num_prompt_logprobs", [(5, 5)]) -@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"]) -def test_multi_step_llm_w_prompt_logprobs( - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp_size: int, - max_tokens: int, - enforce_eager: int, - num_scheduler_steps: int, - num_prompts: int, - num_logprobs: Optional[int], - num_prompt_logprobs: Optional[int], - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test prompt logprobs with multi-step scheduling via sync LLM Engine. - - Set up a vLLM engine instance w/ single-step scheduling as a ground-truth - reference. - - Prompt them with the same example prompts. - - Validate: - * All generated logprobs are all very close - - Args: - hf_runner: HF transformers model runner fixture - vllm_runner: vLLM model runner fixture - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - dtype: tensor datatype for engine to utilize - tp_size: degree of tensor-parallelism - max_tokens: the maximum number of tokens to generate - enforce_eager - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> no logprobs - num_prompt_logprobs: number of logprobs to return for each prompt token; - note that this argument is not supported by the - OpenAI completions endpoint. 
- """ - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - prompts = example_prompts - if len(prompts) < num_prompts: - prompts = prompts * ((num_prompts // len(prompts)) + 1) - prompts = prompts[:num_prompts] - assert len(prompts) == num_prompts - - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - ) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) - - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - ) as vllm_model: - single_step_vllm_outputs = vllm_model.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs, - num_prompt_logprobs=num_prompt_logprobs) - - check_logprobs_close( - outputs_0_lst=single_step_vllm_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp_size", [1]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("enforce_eager", [True]) -@pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) -@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) -@pytest.mark.parametrize("num_logprobs", [None, 5]) -@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"]) -@pytest.mark.skipif( - current_platform.is_rocm(), - reason="Multi-Step + Chunked-Prefill not supported on ROCm") -def test_multi_step_llm_chunked_prefill_prefix_cache( - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp_size: int, - max_tokens: int, - enforce_eager: int, - num_scheduler_steps: int, - num_prompts: int, - num_logprobs: Optional[int], - attention_backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Test vLLM engine with multi-step+"single-step chunked prefill"+APC. - - Set up contrived scenario which tests for a possible failure mode of - scheduling with multi-step+"single-step chunked prefill"+APC - - "single-step chunked prefill" here refers to the current vLLM multi-step+ - chunked-prefill implementation, which requires that a prefill may only - be scheduled in the same step as decodes if the prefill prompt fits in a - single chunk (note that "complete" multi-step+chunked-prefill would allow - a prefill to span multiple chunks & multiple steps but that is not yet - the case.) - - "APC" is short for "automatic prefix caching". - - This test creates a scenario where the scheduler must decide whether/how - to schedule a prefill with a prompt that exceeds the available token budget. - The correct behavior for multi-step+"single-step chunked prefill"+APC is to - put off scheduling the prefill until a future step. - - Validate that: - * Multi-step kernels do not raise an exception due to incorrect scheduler - behavior - * Generated tokens match between - multi-step+"single-step chunked prefill"+APC and - single-step scheduling. 
- * (If logprobs are enabled) check logprobs are close enough - - Args: - vllm_runner: vLLM model runner fixture - example_prompts: test fixture providing example prompts - model: model under test (same for single- and multi-step engines) - dtype: tensor datatype for engine to utilize - tp_size: degree of tensor-parallelism - max_tokens: the maximum number of tokens to generate - enforce_eager - num_scheduler_steps: for multi-step scheduling, GPU-side steps per - GPU -> CPU output transfer - num_prompts: number of example prompts under test - num_logprobs: corresponds to the `logprobs` argument to the OpenAI - completions endpoint; `None` -> 1 logprob returned. - """ - - # Set up contrived test for correct scheduling behavior with - # multi-step+"single-step chunked prefill"+APC. - # - # Assume block_size=16 - # - # Assume max_num_batched_tokens=48 - # => Per-step token budget=48 - # - # 1. Scheduler schedules 0th prompt (24 tokens) - # => Remaining token budget=24 - # 2. Scheduler attempts to schedule 1st prompt (30 tokens) - # * 30 tokens exceeds 24 token remaining budget - # * Correct behavior: do not schedule this prompt in this step - # * Incorrect behavior: schedule prompt chunk - # * `do_sample=False` for this prompt in this step - # * Chunk size = (remaining tokens // block size) * block size - # - # The Incorrect scheduling behavior - if it occurs - will cause an exception - # in the model runner resulting from `do_sample=False`. - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, attention_backend) - - assert len(example_prompts) >= 2 - challenge_prompts = copy.deepcopy(example_prompts) - challenge_prompts[0] = ( - 'vLLM is a high-throughput and memory-efficient ' - 'inference and serving engine for LLMs.\n') # 24 tok - challenge_prompts[1] = ( - 'Briefly describe the major milestones in the ' - 'development of artificial intelligence from 1950 to 2020.\n' - ) # 30 tok - - # If necessary, adjust the length of `challenge_prompts` to match - # `num_prompts` - if len(challenge_prompts) < num_prompts: - challenge_prompts = (challenge_prompts * - ((num_prompts // len(challenge_prompts)) + 1)) - challenge_prompts = challenge_prompts[:num_prompts] - assert len(challenge_prompts) == num_prompts - - # Single-step scheduler baseline - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_baseline = ( - vllm_model.generate_greedy(challenge_prompts, max_tokens) if - num_logprobs is None else vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) - - # multi-step+"single-step chunked prefill"+APC - with vllm_runner( - model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7, - tensor_parallel_size=tp_size, - enable_chunked_prefill=True, - enable_prefix_caching=True, - num_scheduler_steps=num_scheduler_steps, - max_model_len=48, - max_num_batched_tokens=48, - max_num_seqs=4, - block_size=16, - ) as vllm_model: - outputs_w_features = ( - vllm_model.generate_greedy(challenge_prompts, max_tokens) if - num_logprobs is None else vllm_model.generate_greedy_logprobs( - challenge_prompts, max_tokens, num_logprobs)) - - if num_logprobs is None: - # No-logprobs test - check_outputs_equal( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - 
name_1="multi-step+features", - ) - else: - # Yes-logprobs test - check_logprobs_close( - outputs_0_lst=outputs_baseline, - outputs_1_lst=outputs_w_features, - name_0="multi-step", - name_1="multi-step+features", - ) diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py index 8c2121610868..e04f93099e63 100644 --- a/tests/plugins_tests/test_scheduler_plugins.py +++ b/tests/plugins_tests/test_scheduler_plugins.py @@ -3,47 +3,18 @@ import pytest -from vllm.core.scheduler import Scheduler from vllm.engine.arg_utils import EngineArgs -from vllm.engine.llm_engine import LLMEngine from vllm.sampling_params import SamplingParams from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine -class DummyV0Scheduler(Scheduler): - - def schedule(self): - raise Exception("Exception raised by DummyV0Scheduler") - - class DummyV1Scheduler(V1Scheduler): def schedule(self): raise Exception("Exception raised by DummyV1Scheduler") -def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch): - with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - with pytest.raises(Exception) as exception_info: - - engine_args = EngineArgs( - model="facebook/opt-125m", - enforce_eager=True, # reduce test time - scheduler_cls=DummyV0Scheduler, - ) - - engine = LLMEngine.from_engine_args(engine_args=engine_args) - - sampling_params = SamplingParams(max_tokens=1) - engine.add_request("0", "foo", sampling_params) - engine.step() - - assert str( - exception_info.value) == "Exception raised by DummyV0Scheduler" - - def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") diff --git a/tests/prefix_caching/__init__.py b/tests/prefix_caching/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py deleted file mode 100644 index b940ab416e67..000000000000 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ /dev/null @@ -1,49 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Compare the with and without prefix caching. - -Run `pytest tests/prefix_caching/test_prefix_caching.py`. -""" -import pytest - -from vllm import LLM -from vllm.distributed import cleanup_dist_env_and_memory - -MODEL_LEN_LEN = [ - # Example models with sliding window. - ("bigcode/starcoder2-3b", 4096, 16384), - # ("mistralai/Mistral-7B-v0.1", 4096, 32768), << OOM in CI - - # Confirm model with sliding window works. - # config has "use_sliding_window": false - ("Qwen/Qwen1.5-0.5B-Chat", 32768, 32768), - # config has no sliding window attribute. 
- ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 2048, 2048), -] - - -@pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN) -def test_disable_sliding_window(model_len_len, ): - model, sliding_len, full_len = model_len_len - disabled_llm = LLM(model, disable_sliding_window=True) - disabled_llm.generate("Hi my name is") - model_config = disabled_llm.llm_engine.model_config - assert model_config.max_model_len == sliding_len, ( - "Max len expected to equal sliding_len of %s, but got %s", sliding_len, - model_config.max_model_len) - - del disabled_llm - cleanup_dist_env_and_memory() - - enabled_llm = LLM(model, - enforce_eager=True, - disable_sliding_window=False, - enable_prefix_caching=False) - enabled_llm.generate("Hi my name is") - model_config = enabled_llm.llm_engine.model_config - assert model_config.max_model_len == full_len, ( - "Max len expected to equal full_len of %s, but got %s", full_len, - model_config.max_model_len) - - del enabled_llm - cleanup_dist_env_and_memory() diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py deleted file mode 100644 index 5bf6ed957c74..000000000000 --- a/tests/prefix_caching/test_prefix_caching.py +++ /dev/null @@ -1,231 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Compare the with and without prefix caching. - -Run `pytest tests/prefix_caching/test_prefix_caching.py`. -""" - -from __future__ import annotations - -import pytest - -from tests.conftest import VllmRunner -from tests.core.utils import SchedulerProxy, create_dummy_prompt -from vllm import SamplingParams, TokensPrompt -from vllm.core.scheduler import Scheduler -from vllm.engine.llm_engine import LLMEngine -from vllm.platforms import current_platform -from vllm.utils import STR_BACKEND_ENV_VAR - -from ..models.utils import check_outputs_equal - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch: pytest.MonkeyPatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. - """ - with monkeypatch.context() as m: - m.setenv('VLLM_USE_V1', '0') - yield - - -MODELS = [ - "distilbert/distilgpt2", -] - -UNSTABLE_PROMPT_SEQUENCE = [ - ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([3] * 1), - ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([5] * 50), - ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([6] * 95), - ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([7] * 174), - ([0] * 588) + ([8] * 1539), -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [5]) -@pytest.mark.parametrize("cached_position", [0, 1]) -@pytest.mark.parametrize("enable_chunked_prefill", [True, False]) -@pytest.mark.parametrize("block_size", [16]) -def test_mixed_requests( - hf_runner, - vllm_runner, - example_prompts, - model: str, - backend: str, - dtype: str, - max_tokens: int, - cached_position: int, - enable_chunked_prefill: bool, - block_size: int, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """ - Test the case when some sequences have the prefix cache hit - and the others don't. The cached position determines where - the sequence is at among the batch of prefills. 
- """ - if backend == "FLASHINFER" and current_platform.is_rocm(): - pytest.skip("Flashinfer does not support ROCm/HIP.") - if backend == "XFORMERS" and current_platform.is_rocm(): - pytest.skip("Xformers does not support ROCm/HIP.") - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, backend) - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - cached_prompt = example_prompts[cached_position] - with vllm_runner( - model, - dtype=dtype, - enable_prefix_caching=True, - enable_chunked_prefill=enable_chunked_prefill, - block_size=block_size, - ) as vllm_model: - # Run the first prompt so the cache is populated - vllm_outputs = vllm_model.generate_greedy([cached_prompt], - max_tokens) - - # Run all the promopts - greedy_params = SamplingParams(temperature=0.0, - max_tokens=max_tokens) - req_outputs = vllm_model.llm.generate(example_prompts, - greedy_params) - - # Verify number of cached tokens - for i in range(len(req_outputs)): - if i == cached_position: - expected_num_cached_tokens = ( - len(req_outputs[i].prompt_token_ids) // - block_size) * block_size - else: - expected_num_cached_tokens = 0 - assert (req_outputs[i].num_cached_tokens == - expected_num_cached_tokens) - - vllm_outputs = [( - output.prompt_token_ids + list(output.outputs[0].token_ids), - output.prompt + output.outputs[0].text, - ) for output in req_outputs] - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) -def test_unstable_prompt_sequence( - vllm_runner, - backend: str, - monkeypatch: pytest.MonkeyPatch, -) -> None: - - if backend == "FLASHINFER" and current_platform.is_rocm(): - pytest.skip("Flashinfer does not support ROCm/HIP.") - if backend == "XFORMERS" and current_platform.is_rocm(): - pytest.skip("Xformers does not support ROCm/HIP.") - with monkeypatch.context() as m: - m.setenv(STR_BACKEND_ENV_VAR, backend) - - with vllm_runner( - "Qwen/Qwen2.5-0.5B-Instruct", - enable_chunked_prefill=True, - enable_prefix_caching=True, - max_model_len=4096, - ) as vllm_model: - for prompt in UNSTABLE_PROMPT_SEQUENCE: - vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), - SamplingParams(max_tokens=1)) - - -@pytest.mark.parametrize("model", MODELS) -def test_fully_cached_prefill_needs_uncached_token(model): - block_size = 16 - max_num_batched_tokens = 16 - num_output_tokens = 5 - # Make a vllm engine - runner = VllmRunner( - model_name=model, - gpu_memory_utilization=0.7, - enable_chunked_prefill=True, - enforce_eager=True, - enable_prefix_caching=True, - block_size=block_size, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_batched_tokens, - ) - engine: LLMEngine = runner.llm.llm_engine - - scheduler: Scheduler = SchedulerProxy(engine.scheduler[0]) # type: ignore - engine.scheduler[0] = scheduler - - # SeqA - seqA_tokens = list(range(2 * block_size)) - seqA, seq_groupA = create_dummy_prompt( - request_id="0", - prompt_tokens=seqA_tokens, - max_tokens=num_output_tokens, - block_size=block_size, - ) - - scheduler.add_seq_group(seq_groupA) - - assert seqA.data.get_num_computed_tokens() == 0 - - # Prefill seqA - while not seqA.is_finished(): - engine.step() - - # seqB - seqB_tokens = [t + 1 for t in seqA_tokens] # shift by 1 - seqB, seq_groupB = create_dummy_prompt( - request_id="1", - prompt_tokens=seqB_tokens, - max_tokens=num_output_tokens, - block_size=block_size, - 
) - - # seqC is the same as seqA - seqC, seq_groupC = create_dummy_prompt( - request_id="2", - prompt_tokens=seqA_tokens, - max_tokens=num_output_tokens, - block_size=block_size, - ) - - scheduler.add_seq_group(seq_groupB) - scheduler.add_seq_group(seq_groupC) - - # Even seqC is fully cached, it should not be prefilled since we - # require at least 1 uncached token. - engine.step() - - sched_metas, sched_out, _ = scheduler.last_schedule_ret() - assert len(sched_out.scheduled_seq_groups) == 1 - assert (sched_out.scheduled_seq_groups[0].seq_group.request_id == - seq_groupB.request_id) - assert (sched_out.scheduled_seq_groups[0].token_chunk_size == - max_num_batched_tokens) - - # When seqB is finished, seqC could be prefilled. - while not seqB.is_finished(): - engine.step() - sched_metas, sched_out, _ = scheduler.last_schedule_ret() - assert len(sched_out.scheduled_seq_groups) == 1 - assert (sched_out.scheduled_seq_groups[0].seq_group.request_id == - seq_groupB.request_id) - - engine.step() - sched_metas, sched_out, _ = scheduler.last_schedule_ret() - assert len(sched_out.scheduled_seq_groups) == 1 - assert (sched_out.scheduled_seq_groups[0].seq_group.request_id == - seq_groupC.request_id) - assert sched_out.scheduled_seq_groups[0].token_chunk_size == len( - seqA_tokens) diff --git a/tests/samplers/__init__.py b/tests/samplers/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py deleted file mode 100644 index bdf48c7687b2..000000000000 --- a/tests/samplers/test_beam_search.py +++ /dev/null @@ -1,142 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Compare the outputs of HF and vLLM when using beam search. - -Run `pytest tests/samplers/test_beam_search.py`. -""" - -import pytest -from transformers import AutoModelForSeq2SeqLM - -from vllm.assets.audio import AudioAsset - - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - """We can run both engines for this test.""" - pass - - -# FIXME(zhuohan): The test can not pass if we: -# 1. Increase max_tokens to 256. -# 2. Increase beam_width to 8. -# 3. Use the model "huggyllama/llama-7b". -MAX_TOKENS = [64] -BEAM_WIDTHS = [4] -MM_BEAM_WIDTHS = [2] -MODELS = ["TinyLlama/TinyLlama-1.1B-Chat-v1.0"] - - -@pytest.mark.skip_v1 # FIXME: This fails on V1 right now. 
-@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", MAX_TOKENS) -@pytest.mark.parametrize("beam_width", BEAM_WIDTHS) -def test_beam_search_single_input( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - beam_width: int, -) -> None: - example_prompts = example_prompts[:1] - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, - max_tokens) - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_beam_search(example_prompts, - beam_width, max_tokens) - - for i in range(len(example_prompts)): - hf_output_ids, hf_output_texts = hf_outputs[i] - vllm_output_ids, vllm_output_texts = vllm_outputs[i] - for j, (hf_text, - vllm_text) in enumerate(zip(hf_output_texts, - vllm_output_texts)): - print(f">>>{j}-th hf output:") - print(hf_text) - print(f">>>{j}-th vllm output:") - print(vllm_text) - assert len(hf_output_ids) == len(vllm_output_ids) - for j in range(len(hf_output_ids)): - assert hf_output_ids[j] == vllm_output_ids[j], ( - f"Test{i} output{j}:\nHF: {hf_output_ids}\n" - f"vLLM: {vllm_output_ids}") - - -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", MAX_TOKENS) -@pytest.mark.parametrize("beam_width", MM_BEAM_WIDTHS) -def test_beam_search_passes_multimodal_data( - hf_runner, - vllm_runner, - dtype: str, - max_tokens: int, - beam_width: int, -) -> None: - """Ensure that beam search passes multimodal data through correctly.""" - # NOTE - this test is primarily to check that mm data is passed to beams - # correctly. As such, we just need to check one extra modality to make - # sure things pass through properly. 
- audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate] - model = "Qwen/Qwen2-Audio-7B-Instruct" - audio_seq = "<|audio_bos|><|AUDIO|><|audio_eos|>" - prompts = [ - f"<|im_start|>user\n{audio_seq}Can you transcribe this?<|im_end|>\n<|im_start|>assistant\n" #noqa: E501 - ] - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - audio_token_id = hf_model.config.audio_token_index - eos_token_id = hf_model.tokenizer.eos_token_id # <|im_end|> - hf_outputs = hf_model.generate_beam_search( - prompts, - beam_width=beam_width, - max_tokens=max_tokens, - audios=audios, - ) - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_beam_search( - prompts, - beam_width=beam_width, - max_tokens=max_tokens, - audios=audios, - ) - - seq_with_no_audio_toks = lambda seq: [ - tok for tok in seq if tok != audio_token_id - ] - - for i in range(len(prompts)): - hf_output_ids, hf_output_texts = hf_outputs[i] - vllm_output_ids, vllm_output_texts = vllm_outputs[i] - - for j, (hf_text, - vllm_text) in enumerate(zip(hf_output_texts, - vllm_output_texts)): - print(f">>>{j}-th hf output [NOTE: special tokens are filtered]:") - print(hf_text) - print(f">>>{j}-th vllm output:") - print(vllm_text) - assert len(hf_output_ids) == len(vllm_output_ids) - - for j in range(len(hf_output_ids)): - # Compare everything except for the audio tokens; we do this since - # the IDs returned from the transformers helper expands the audio - # token to match features, while the vLLM helper maintains the - # single audio token in the input text - filtered_hf_output_ids = seq_with_no_audio_toks(hf_output_ids[j]) - filtered_vllm_output_ids = seq_with_no_audio_toks( - vllm_output_ids[j]) - - # HF output IDs may contain the end of sequence - if len(filtered_hf_output_ids - ) == len(filtered_vllm_output_ids) + 1: - assert filtered_hf_output_ids[-1] == eos_token_id - filtered_hf_output_ids = filtered_hf_output_ids[:-1] - - assert filtered_hf_output_ids == filtered_vllm_output_ids diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py deleted file mode 100644 index ea4a17dd2306..000000000000 --- a/tests/samplers/test_ignore_eos.py +++ /dev/null @@ -1,42 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Make sure ignore_eos works. - -Run `pytest tests/samplers/test_ignore_eos.py`. -""" - -import pytest - -from vllm import SamplingParams - - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - """We can run both engines for this test.""" - pass - - -# We also test with llama because it has generation_config to specify EOS -# (past regression). 
-MODELS = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B"] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [512]) -def test_ignore_eos( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - sampling_params = SamplingParams(max_tokens=max_tokens, - ignore_eos=True) - - for prompt in example_prompts: - ignore_eos_output = vllm_model.llm.generate( - prompt, sampling_params=sampling_params) - output_length = len(ignore_eos_output[0].outputs[0].token_ids) - assert output_length == max_tokens diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py deleted file mode 100644 index 123f9595e97b..000000000000 --- a/tests/samplers/test_logits_processor.py +++ /dev/null @@ -1,70 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm import SamplingParams - -MODELS = ["distilbert/distilgpt2"] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This file tests V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -def test_logits_processor_force_generate( - vllm_runner, - example_prompts, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - tokenizer = vllm_model.llm.get_tokenizer() - repeat_times = 2 - enforced_answers = " vLLM" - vllm_token_ids = tokenizer.encode(enforced_answers, - add_special_tokens=False) - max_tokens = len(vllm_token_ids) * repeat_times - - def pick_vllm(token_ids, logits): - token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)] - logits[token_id] = torch.finfo(logits.dtype).max - return logits - - params_with_logprobs = SamplingParams( - logits_processors=[pick_vllm], - prompt_logprobs=3, - max_tokens=max_tokens, - ) - - # test logits_processors when prompt_logprobs is not None - vllm_model.llm._add_request( - example_prompts[0], - params=params_with_logprobs, - ) - - # test prompt_logprobs is not None - vllm_model.llm._add_request( - example_prompts[1], - params=SamplingParams( - prompt_logprobs=3, - max_tokens=max_tokens, - ), - ) - - # test grouped requests - vllm_model.llm._add_request( - example_prompts[2], - params=SamplingParams(max_tokens=max_tokens), - ) - - outputs = vllm_model.llm._run_engine(use_tqdm=False) - - assert outputs[0].outputs[0].text == enforced_answers * repeat_times diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py deleted file mode 100644 index 87f40b100531..000000000000 --- a/tests/samplers/test_logprobs.py +++ /dev/null @@ -1,182 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm import SamplingParams - -from ..conftest import VllmRunner - -MODELS = ["distilbert/distilgpt2"] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module is V0 only since it uses dtype=float, so - set VLLM_USE_V1=0 for all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", - ["float"]) # needed for comparing logprobs with HF -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) -@pytest.mark.parametrize("num_top_logprobs", [0, 6]) # 32000 == vocab_size -@pytest.mark.parametrize("detokenize", [True, False]) -def test_get_prompt_logprobs( - hf_runner, - vllm_runner, - model, - dtype, - chunked_prefill_token_size: int, - num_top_logprobs: int, - detokenize: bool, - example_prompts, -): - max_num_seqs = 256 - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_seqs = min(chunked_prefill_token_size, max_num_seqs) - max_num_batched_tokens = chunked_prefill_token_size - - max_tokens = 5 - with hf_runner(model, dtype=dtype) as hf_model: - hf_logprobs = hf_model.generate_greedy_logprobs( - example_prompts, - max_tokens=max_tokens, - ) - - with vllm_runner( - model, - dtype=dtype, - max_logprobs=num_top_logprobs, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs, - ) as vllm_model: - vllm_sampling_params = SamplingParams(max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_top_logprobs, - temperature=0.0, - detokenize=detokenize) - vllm_results = vllm_model.llm.generate( - example_prompts, sampling_params=vllm_sampling_params) - - # Test whether logprobs are included in the results. - for result in vllm_results: - assert result.prompt_logprobs is not None - assert result.outputs[0].logprobs is not None - assert len(result.outputs[0].logprobs) == max_tokens - for logprobs in result.outputs[0].logprobs: - # If the output token is not included in the top X - # logprob, it can return 1 more data - assert (len(logprobs) == num_top_logprobs - or len(logprobs) == num_top_logprobs + 1) - output_text = result.outputs[0].text - output_string_from_most_likely_tokens_lst: list[str] = [] - for top_logprobs in result.outputs[0].logprobs: - top_logprob = next(iter(top_logprobs.values())) - output_string_from_most_likely_tokens_lst.append( - top_logprob.decoded_token) - - if detokenize: - output_string_from_most_likely_tokens = "".join( - output_string_from_most_likely_tokens_lst) - assert output_text == output_string_from_most_likely_tokens, ( - "The output text from the top logprob for each token position " - "should be the same as the output text in the result.") - else: - assert output_text == '' - assert output_string_from_most_likely_tokens_lst == ([None] * - max_tokens) - - # The first prompt logprob is always None - assert result.prompt_logprobs[0] is None - for prompt_logprobs in result.prompt_logprobs[1:]: - # If the prompt token is not included in the top X - # logprob, it can return 1 more data - assert (len(prompt_logprobs) == num_top_logprobs - or len(prompt_logprobs) == num_top_logprobs + 1) - - # Test whether prompt logprobs are consistent with HF - for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs): - # Check prompt logprobs - # The first prompt logprob is always None, so we compare it from 1:. 
- vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] - for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): - for token_id, logprob in vllm_prompt_logprob_dict.items(): - torch.testing.assert_close(logprob.logprob, - hf_logprob[0][i][token_id].item(), - atol=1e-2, - rtol=1e-2) - vllm_sample_logprobs = vllm_result.outputs[0].logprobs - for i, top_logprobs in enumerate(vllm_sample_logprobs): - for token_id, sample_logprob in top_logprobs.items(): - logprob = sample_logprob.logprob - torch.testing.assert_close(logprob, - hf_logprob[i][-1][token_id].item(), - atol=1e-2, - rtol=1e-2) - if detokenize: - assert isinstance(sample_logprob.decoded_token, str), ( - "The token should be decoded by the time it is returned" - " to the user.") - - # Test if prompt logprobs are correctly set. - for vllm_result in vllm_results: - token_ids = vllm_result.prompt_token_ids - prompt_logprobs = vllm_result.prompt_logprobs - - # The first token doesn't have logprob. - assert prompt_logprobs[0] is None - - for token_id, logprob_dict in zip(token_ids[1:], prompt_logprobs[1:]): - assert token_id in logprob_dict - - -def test_max_logprobs(): - runner = VllmRunner("facebook/opt-125m", max_logprobs=1) - vllm_sampling_params = SamplingParams(logprobs=1) - # should pass - runner.generate(["Hello world"], sampling_params=vllm_sampling_params) - - bad_sampling_params = SamplingParams(logprobs=2) - with pytest.raises(ValueError): - runner.generate(["Hello world"], sampling_params=bad_sampling_params) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) -@pytest.mark.parametrize("detokenize", [True, False]) -def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int, - detokenize: bool, example_prompts): - max_num_seqs = 256 - enable_chunked_prefill = False - max_num_batched_tokens = None - if chunked_prefill_token_size != -1: - enable_chunked_prefill = True - max_num_seqs = min(chunked_prefill_token_size, max_num_seqs) - max_num_batched_tokens = chunked_prefill_token_size - max_tokens = 5 - - with vllm_runner( - model, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - max_num_seqs=max_num_seqs, - ) as vllm_model: - sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, - logprobs=None, - temperature=0.0, - detokenize=detokenize) - results_logprobs_none = vllm_model.llm.generate( - example_prompts, sampling_params=sampling_params_logprobs_none) - - for i in range(len(results_logprobs_none)): - assert results_logprobs_none[i].outputs[0].logprobs is None - assert results_logprobs_none[i].outputs[0].cumulative_logprob is None diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py deleted file mode 100644 index 11803b8d7a5e..000000000000 --- a/tests/samplers/test_no_bad_words.py +++ /dev/null @@ -1,194 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Make sure bad_words works. - -Run `pytest tests/samplers/test_no_bad_words.py`. 
- -""" -from typing import Optional - -import pytest -from transformers import AutoTokenizer - -from vllm import LLM, SamplingParams - - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - """We can run both engines for this test.""" - pass - - -def _generate( - llm: LLM, - prompt: str, - num_prompt_tokens: int, - temperature: float = 0, - bad_words: Optional[list[str]] = None, -) -> list[int]: - sampling_params = SamplingParams( - temperature=temperature, - bad_words=bad_words, - ) - - # [([output_token_ids, ], [output_text, ]), ] - output = llm.generate([prompt], sampling_params=sampling_params) - - output_token_ids = output[0][0][0][num_prompt_tokens:] - # [0] first (and only) request output - # [0] token_ids (not text) - # [0] first (and only) output completion - - return output_token_ids - - -class TestOneTokenBadWord: - MODEL = "TheBloke/Llama-2-7B-fp16" - - PROMPT = "Hi! How are" - TARGET_TOKEN = "you" - - def setup_method(self, method): - self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL, - add_prefix_space=True) - - self.num_prompt_tokens = len(self._encode(self.PROMPT)) - self.target_token_id = self._encode(self.TARGET_TOKEN, - add_special_tokens=False)[0] - - def test_one_token_bad_word(self, vllm_runner): - with vllm_runner(self.MODEL) as llm: - output_token_ids = self._generate(llm) - assert output_token_ids[0] == self.target_token_id - - output_token_ids = self._generate(llm, - bad_words=[self.TARGET_TOKEN]) - assert self.target_token_id not in output_token_ids - - def _generate(self, - llm: LLM, - bad_words: Optional[list[str]] = None) -> list[int]: - return _generate( - llm=llm, - prompt=self.PROMPT, - num_prompt_tokens=self.num_prompt_tokens, - bad_words=bad_words, - ) - - def _encode(self, - prompt: str, - add_special_tokens: bool = True) -> list[int]: - return self.tokenizer(prompt, - add_special_tokens=add_special_tokens).input_ids - - -class TestTwoTokenBadWord: - # Another model (with a different tokenizer behaviour) - MODEL = "distilbert/distilgpt2" - - PROMPT = "How old are you? 
I am 10" - TARGET_TOKEN1 = "years" - TARGET_TOKEN2 = "old" - NEIGHBOUR_TOKEN2 = "older" - - def setup_method(self, method): - self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL, - add_prefix_space=True) - - self.num_prompt_tokens = len(self._encode(self.PROMPT)) - self.target_token_id1 = self._encode(self.TARGET_TOKEN1, - add_special_tokens=False)[0] - self.target_token_id2 = self._encode(self.TARGET_TOKEN2, - add_special_tokens=False)[0] - self.neighbour_token_id2 = self._encode(self.NEIGHBOUR_TOKEN2, - add_special_tokens=False)[0] - - def test_two_token_bad_word(self, vllm_runner): - with vllm_runner(self.MODEL, dtype="half") as llm: - output_token_ids = self._generate(llm) - assert output_token_ids[:2] == [ - self.target_token_id1, self.target_token_id2 - ] - - output_token_ids = self._generate(llm, - bad_words=[self.TARGET_TOKEN1]) - assert self.target_token_id1 not in output_token_ids - - output_token_ids = self._generate(llm, - bad_words=[self.TARGET_TOKEN2]) - assert output_token_ids[0] == self.target_token_id1 - assert self.target_token_id2 not in output_token_ids - - output_token_ids = self._generate( - llm, bad_words=[f'{self.TARGET_TOKEN1} {self.TARGET_TOKEN2}']) - assert output_token_ids[0] == self.target_token_id1 - assert output_token_ids[:2] != [ - self.target_token_id1, self.target_token_id2 - ] - assert not self._contains( - output_token_ids, - [self.target_token_id1, self.target_token_id2]) - # Model dependent behaviour - assert output_token_ids[:2] == [ - self.target_token_id1, self.neighbour_token_id2 - ] - - output_token_ids = self._generate( - llm, - bad_words=[ - f'{self.TARGET_TOKEN1} {self.TARGET_TOKEN2}', - f'{self.TARGET_TOKEN1} {self.NEIGHBOUR_TOKEN2}' - ]) - assert output_token_ids[0] == self.target_token_id1 - assert output_token_ids[:2] != [ - self.target_token_id1, self.target_token_id2 - ] - assert not self._contains( - output_token_ids, - [self.target_token_id1, self.target_token_id2]) - assert output_token_ids[:2] != [ - self.target_token_id1, self.neighbour_token_id2 - ] - assert not self._contains( - output_token_ids, - [self.target_token_id1, self.neighbour_token_id2]) - assert ((self.target_token_id2 in output_token_ids) - or (self.neighbour_token_id2 in output_token_ids)) - - def _generate(self, - llm: LLM, - bad_words: Optional[list[str]] = None) -> list[int]: - return _generate( - llm=llm, - prompt=self.PROMPT, - num_prompt_tokens=self.num_prompt_tokens, - bad_words=bad_words, - ) - - @staticmethod - def _contains(sequence: list[int], subsequence: list[int]) -> bool: - searched = False - - for start in range(len(sequence)): - end = start + len(subsequence) - current_subsequence = sequence[start:end] - - if len(current_subsequence) < len(subsequence): - continue - - searched = True - - assert len(current_subsequence) == len(subsequence) - - if current_subsequence == subsequence: - return True - - assert searched, "All subsequences did not match in length..." 
- - return False - - def _encode(self, - prompt: str, - add_special_tokens: bool = True) -> list[int]: - return self.tokenizer(prompt, - add_special_tokens=add_special_tokens).input_ids diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py deleted file mode 100644 index 86fc14dc85f8..000000000000 --- a/tests/samplers/test_ranks.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm import SamplingParams - -MODELS = ["distilbert/distilgpt2"] - - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - """We can run both engines for this test.""" - pass - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -def test_ranks( - vllm_runner, - model, - dtype, - example_prompts, -): - max_tokens = 5 - num_top_logprobs = 5 - num_prompt_logprobs = 5 - - with vllm_runner(model, dtype=dtype, - max_logprobs=num_top_logprobs) as vllm_model: - - ## Test greedy logprobs ranks - vllm_sampling_params = SamplingParams( - temperature=0.0, - top_p=1.0, - max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_prompt_logprobs) - vllm_results = vllm_model.generate_w_logprobs(example_prompts, - vllm_sampling_params) - - ## Test non-greedy logprobs ranks - sampling_params = SamplingParams(temperature=1.0, - top_p=1.0, - max_tokens=max_tokens, - logprobs=num_top_logprobs, - prompt_logprobs=num_prompt_logprobs) - res = vllm_model.generate_w_logprobs(example_prompts, sampling_params) - - for result in vllm_results: - assert result[2] is not None - assert len(result[2]) == len(result[0]) - # check whether all chosen tokens have ranks = 1 - for token, logprobs in zip(result[0], result[2]): - assert token in logprobs - assert logprobs[token].rank == 1 - - for result in res: - assert result[2] is not None - assert len(result[2]) == len(result[0]) - # check whether all chosen tokens have ranks - for token, logprobs in zip(result[0], result[2]): - assert logprobs[token].rank >= 1 diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py deleted file mode 100644 index 520b88d03ac8..000000000000 --- a/tests/samplers/test_sampler.py +++ /dev/null @@ -1,769 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import itertools -import random -from dataclasses import dataclass -from typing import Optional -from unittest.mock import Mock, patch - -import pytest -import torch -from transformers import GenerationConfig, GenerationMixin - -import vllm.envs as envs -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import Counter, is_pin_memory_available - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This file tests V0 internals, so set VLLM_USE_V1=0. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -class MockLogitsSampler(Sampler): - - def __init__(self, fake_logits: torch.Tensor): - super().__init__() - self.fake_logits = fake_logits - - def forward(self, *args, **kwargs): - return super().forward(*args, **kwargs) - - -def _prepare_test( - batch_size: int -) -> tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]: - input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) - fake_logits = torch.full((batch_size, VOCAB_SIZE), - 1e-2, - dtype=input_tensor.dtype) - sampler = MockLogitsSampler(fake_logits) - return input_tensor, fake_logits, sampler - - -VOCAB_SIZE = 32000 -RANDOM_SEEDS = list(range(128)) -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - - -def _do_sample( - batch_size: int, - input_tensor: torch.Tensor, - sampler: MockLogitsSampler, - sampling_params: SamplingParams, - device: str, -): - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - seq_lens: list[int] = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params, - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - return sampler(logits=input_tensor, sampling_metadata=sampling_metadata) - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_greedy(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, sampler = _prepare_test(batch_size) - - sampling_params = SamplingParams(temperature=0) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - expected = torch.argmax(fake_logits, dim=-1) - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == expected[i].item() - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_random(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - for i in range(batch_size): - fake_logits[i, i] = 1e2 - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == i - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_random_seed(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - for i in range(batch_size): - fake_logits[i, i] = 1e2 - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - seed=random.randint(0, 10000), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - for i, sequence_output in 
enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == i - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_all_random_seed_deterministic(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - seed=random.randint(0, 10000), - ) - first_sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - second_sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - assert first_sampler_output == second_sampler_output - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_min_tokens_penalty(seed: int, device: str): - seq_id_counter = Counter(start=random.randint(0, 100)) - set_random_seed(seed) - torch.set_default_device(device) - - def create_sampling_params(min_tokens, - eos_token_id=0, - *, - stop_token_ids: Optional[list[int]] = None, - prompt_logprobs: Optional[int] = None): - sampling_params = SamplingParams( - min_tokens=min_tokens, - max_tokens=9999, # keep higher than max of min_tokens - stop_token_ids=stop_token_ids, - # requesting prompt_logprobs changes the structure of `logits` - prompt_logprobs=prompt_logprobs, - ) - sampling_params.all_stop_token_ids.add(eos_token_id) - return sampling_params - - def create_sequence_data(num_input=3, num_generated=0): - seq_data = SequenceData.from_seqs( - random.choices(range(0, VOCAB_SIZE), k=num_input)) - if num_generated > 0: - seq_data.output_token_ids = random.choices(range(0, VOCAB_SIZE), - k=num_generated) - return seq_data - - def generate_test_case(): - # generate multiple seq groups but limit total batch size - batch_size = random.randint(1, 128) - - expected_penalization = [] - sequence_metadata_list: list[SequenceGroupMetadata] = [] - # 20% chance to generate seq group metadata list with all prompts - is_prompt = random.random() < 0.2 - while batch_size > 0: - num_seqs = 1 if is_prompt else random.randint(1, batch_size) - - eos_token_id = random.randint(0, VOCAB_SIZE - 1) - min_tokens = random.randint(0, 50) - num_stop_tokens = random.randint(0, 8) - if num_stop_tokens > 0: - stop_token_ids = random.choices(range(0, VOCAB_SIZE - 1), - k=num_stop_tokens) - else: - stop_token_ids = None - - sampling_params = create_sampling_params( - min_tokens=min_tokens, - eos_token_id=eos_token_id, - stop_token_ids=stop_token_ids) - - seq_data: dict[int, SequenceData] = {} - seq_group_penalization: list[bool] = [] - for _ in range(num_seqs): - num_input = random.randint(1, 100) - num_generated = 0 if is_prompt else random.randint(1, 100) - seq_data[next(seq_id_counter)] = create_sequence_data( - num_input=num_input, num_generated=num_generated) - seq_group_penalization.append(num_generated < min_tokens) - - expected_penalization.extend(seq_group_penalization) - sequence_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{batch_size}", - is_prompt=is_prompt, - seq_data=seq_data, - sampling_params=sampling_params, - block_tables={}, - )) - batch_size -= num_seqs - - return { - "expected_penalization": expected_penalization, - "seq_group_metadata_list": sequence_metadata_list, - } - - # define some explicit test cases for edge case behavior - prompt_without_penalization = { - 
"expected_penalization": [False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(), - }, - sampling_params=create_sampling_params(0), - block_tables={}, - ), - ] - } - - prompt_with_penalization = { - "expected_penalization": [True], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(), - }, - sampling_params=create_sampling_params(1), - block_tables={}, - ), - ] - } - - prompt_with_penalization_and_prompt_logprobs = { - "expected_penalization": [False, False, True], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(num_input=3), - }, - sampling_params=create_sampling_params(1, prompt_logprobs=3), - block_tables={}, - ), - ] - } - - stop_penalizing_after_min_tokens = { - "expected_penalization": [False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=False, - seq_data={ - next(seq_id_counter): - create_sequence_data(num_generated=1), - }, - sampling_params=create_sampling_params(1), - block_tables={}, - ) - ] - } - - stop_token_ids = [42, 99, 42, 0] # intentional duplication - prompt_combination = { - "expected_penalization": [False, True, False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_2", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(num_input=2), - }, - sampling_params=create_sampling_params(1, prompt_logprobs=3), - block_tables={}, - ), - SequenceGroupMetadata( - request_id="test_3", - is_prompt=True, - seq_data={ - next(seq_id_counter): create_sequence_data(), - }, - sampling_params=create_sampling_params( - 0, stop_token_ids=stop_token_ids), - block_tables={}, - ) - ] - } - - stop_token_ids = [1, 999, 37, 37] # intentional duplication - decode_combination = { - "expected_penalization": [True, False, False, True, False], - "seq_group_metadata_list": [ - SequenceGroupMetadata( - request_id="test_1", - is_prompt=False, - seq_data={ - next(seq_id_counter): - create_sequence_data(num_generated=1), - next(seq_id_counter): - create_sequence_data(num_generated=100), - }, - sampling_params=create_sampling_params( - 2, stop_token_ids=stop_token_ids), - block_tables={}, - ), - SequenceGroupMetadata( - request_id="test_2", - is_prompt=False, - seq_data={ - next(seq_id_counter): - create_sequence_data(num_generated=20), - next(seq_id_counter): - create_sequence_data(num_generated=1), - next(seq_id_counter): - create_sequence_data(num_generated=10), - }, - sampling_params=create_sampling_params( - 10, prompt_logprobs=5, stop_token_ids=stop_token_ids), - block_tables={}, - ), - ] - } - - if seed == 0: - test_cases = [ - prompt_without_penalization, - prompt_with_penalization, - prompt_with_penalization_and_prompt_logprobs, - stop_penalizing_after_min_tokens, - prompt_combination, - decode_combination, - ] - else: - test_cases = [generate_test_case()] - - def run_test_case(*, expected_penalization: list[bool], - seq_group_metadata_list: list[SequenceGroupMetadata]): - assert expected_penalization, \ - "Invalid test case, need expected_penalization" - assert seq_group_metadata_list, \ - "Invalid test case, need seq_group_metadata_list" - - batch_size = 0 - seq_lens: list[int] = [] - sampling_params_per_row: list[SamplingParams] = [] - for sgm in seq_group_metadata_list: - 
sampling_params = sgm.sampling_params - - num_rows = len(sgm.seq_data) - if sgm.is_prompt: - # a prompt seq_group has only one sequence - seq_data = next(iter(sgm.seq_data.values())) - prompt_len = seq_data.get_prompt_len() - seq_lens.append(prompt_len) - - assert sgm.sampling_params is not None - if sgm.sampling_params.prompt_logprobs: - # with prompt_logprobs each token in the prompt has a row in - # logits - num_rows = prompt_len - - batch_size += num_rows - sampling_params_per_row.extend( - itertools.repeat(sampling_params, num_rows)) - - assert len( - expected_penalization - ) == batch_size, \ - ("Invalid test case, expected_penalization does not match computed" - "batch size") - - _, fake_logits, sampler = _prepare_test(batch_size) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens=seq_lens if seq_lens else None, - query_lens=seq_lens if seq_lens else [1] * batch_size, - device=device, - pin_memory=is_pin_memory_available()) - # the logits tensor is modified in-place by the sampler - _ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata) - - for logits_idx, (should_penalize, sampling_params) in enumerate( - zip(expected_penalization, sampling_params_per_row)): - - tokens_to_check = sampling_params.all_stop_token_ids - - if should_penalize: - for token_id in tokens_to_check: - assert fake_logits[logits_idx, token_id] == -float( - 'inf' - ), f"Expected token {token_id} for logits row {logits_idx}" - " to be penalized" - # no other tokens should be set to -inf - assert torch.count_nonzero( - fake_logits[logits_idx, :] == -float('inf')) == len( - tokens_to_check - ), f"Expected only {len(tokens_to_check)} to be penalized" - else: - # no tokens should be set to -inf - assert torch.count_nonzero( - fake_logits[logits_idx, :] == - -float('inf')) == 0, "No tokens should have been penalized" - - for test_case in test_cases: - run_test_case(**test_case) - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_mixed(seed: int, device: str): - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, sampler = _prepare_test(batch_size) - - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - expected_tokens: list[Optional[list[int]]] = [] - seq_lens: list[int] = [] - for i in range(batch_size): - expected: Optional[list[int]] = None - sampling_type = random.randint(0, 2) - if sampling_type == 0: - sampling_params = SamplingParams(temperature=0) - expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] - elif sampling_type in (1, 2): - n = random.randint(1, 10) - sampling_params = SamplingParams( - temperature=random.random() + 0.1, - top_p=min(random.random() + 0.1, 1), - top_k=random.randint(0, 10), - n=n, - presence_penalty=random.randint(0, 1), - ) - if sampling_type == 2: - sampling_params.seed = random.randint(0, 10000) - else: - for idx in range(n): - fake_logits[i, i + idx] = 1e2 - expected = list(range(i, i + n)) - - expected_tokens.append(expected) - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params, - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - generators: dict[str, torch.Generator] = {} - - def test_sampling(): - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - 
query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available(), - generators=generators) - sampler_output = sampler(logits=fake_logits, - sampling_metadata=sampling_metadata) - - for i, (sequence_output, metadata) in enumerate( - zip(sampler_output, seq_group_metadata_list)): - assert metadata.sampling_params is not None - - if (metadata.sampling_params.seed is not None - and expected_tokens[i] is None): - # Record seeded random result to compare with results of - # second invocation - expected_tokens[i] = [ - nth_output.output_token - for nth_output in sequence_output.samples - ] - continue - - expected_tokens_item = expected_tokens[i] - assert expected_tokens_item is not None - - for n, nth_output in enumerate(sequence_output.samples): - assert metadata.sampling_params is not None - - if (metadata.sampling_params.temperature == 0 - or metadata.sampling_params.seed is not None): - # Ensure exact matches for greedy or random with seed - assert nth_output.output_token == expected_tokens_item[n] - else: - # For non-seeded random check that one of the high-logit - # tokens were chosen - assert nth_output.output_token in expected_tokens_item - - # Test batch - test_sampling() - - # Shuffle the batch and resample - target_index = list(range(batch_size)) - for list_to_shuffle in (target_index, seq_group_metadata_list, - expected_tokens, seq_lens): - random.Random(seed).shuffle(list_to_shuffle) - target_index = torch.tensor(target_index) - input_tensor.data = input_tensor.index_select(0, target_index) - fake_logits.data = fake_logits.index_select(0, target_index) - - # This time, results of seeded random samples will be compared with - # the corresponding sample in the pre-shuffled batch - test_sampling() - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_top_k_top_p(seed: int, device: str): - set_random_seed(seed) - batch_size = random.randint(1, 256) - top_k = random.randint(100, 500) - top_p = random.random() * 0.1 - vocab_size = 32000 - input_tensor = torch.rand((batch_size, 1024), - device=device, - dtype=torch.float16) - fake_logits = torch.normal(0, - 5, - size=(batch_size, vocab_size), - device=input_tensor.device, - dtype=input_tensor.dtype) - sampler = MockLogitsSampler(fake_logits) - - generation_model = GenerationMixin() - generation_config = GenerationConfig(top_k=top_k, - top_p=top_p, - do_sample=True) - - @dataclass - class MockConfig: - is_encoder_decoder: bool = False - - generation_model.config = MockConfig() # needed by the following method - generation_model._prepare_special_tokens(generation_config, device=device) - processors = generation_model._get_logits_processor(generation_config, - None, - None, - None, [], - device=device) - assert len(processors) == 2 # top_p and top_k - - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - seq_lens: list[int] = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=SamplingParams( - temperature=1, - top_k=top_k, - top_p=top_p, - ), - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - - sample_probs = None - - def mock_sample(probs, *args, **kwargs): - nonlocal sample_probs - 
sample_probs = probs - return ([[prob.topk(1, dim=-1).indices.tolist(), [0]] - for prob in probs], None) - - # top-k and top-p is only calculated when flashinfer kernel is not available - with patch("vllm.model_executor.layers.sampler._sample", mock_sample), \ - patch("vllm.model_executor.layers.sampler." - "flashinfer_top_k_top_p_sampling", None): - sampler(logits=fake_logits, sampling_metadata=sampling_metadata) - - assert sample_probs is not None - - hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone()) - hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) - torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5) - assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_flashinfer_fallback(seed: int, device: str): - if not envs.VLLM_USE_FLASHINFER_SAMPLER: - pytest.skip("Flashinfer sampler is disabled") - - pytest.skip("After FlashInfer 0.2.3, sampling will never fail") - - set_random_seed(seed) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - def failing_flashinfer_sampling(*_args, **_kwargs): - return None, torch.zeros(batch_size, device=device, dtype=torch.int32) - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - seed=random.randint(0, 10000), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - with patch( - "vllm.model_executor.layers.sampler." - "flashinfer_top_k_top_p_sampling", failing_flashinfer_sampling): - fallback_sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - assert sampler_output == fallback_sampler_output - - -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_repetition_penalty_mixed(device: str): - - vocab_size = 8 - - def test_sampling_params(sampling_params: list[SamplingParams]): - - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - seq_lens: list[int] = [] - for i in range(2): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params[i], - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=is_pin_memory_available()) - - fake_logits = torch.full((2, vocab_size), - 1e-2, - device=device, - dtype=torch.float16) - - fake_logits[:, 5] = 1.1e-2 - fake_logits[:, 1] = 1.2e-2 - - sampler = MockLogitsSampler(fake_logits) - - sampler_output = sampler(logits=fake_logits, - sampling_metadata=sampling_metadata) - - generated_tokens = [] - for output in sampler_output: - generated_tokens.append(output.samples[0].output_token) - - return generated_tokens - - # one configuration is greedy with repetition_penalty - sampling_params_rep = SamplingParams( - temperature=0.0, - repetition_penalty=2.0, - ) - - # other configuration is sampling w/o repetition_penalty - sampling_params_sample = SamplingParams( - temperature=1.0, - top_k=1, - seed=42, - ) - - tokens1 = test_sampling_params( - [sampling_params_rep, sampling_params_sample]) - - tokens2 = test_sampling_params( - [sampling_params_sample, sampling_params_rep]) - - assert tokens1[0] == tokens2[1] - assert tokens1[1] == tokens2[0] - - 
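# A short, hedged sketch of the user-facing knobs the sampler tests above exercise:
# repetition_penalty and per-request seeds are ordinary SamplingParams fields, so the
# same behaviour is reproducible end to end without touching sampler internals. The
# model name, prompt, and parameter values are illustrative assumptions.
from vllm import LLM, SamplingParams

llm = LLM(model="distilbert/distilgpt2")
seeded = SamplingParams(temperature=1.0, seed=42, repetition_penalty=1.2,
                        max_tokens=16)
first = llm.generate(["vLLM is"], sampling_params=seeded)
second = llm.generate(["vLLM is"], sampling_params=seeded)
# Seeded random sampling is deterministic across identical requests.
assert first[0].outputs[0].token_ids == second[0].outputs[0].token_ids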
-@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_sampler_include_gpu_probs_tensor(device: str): - set_random_seed(42) - torch.set_default_device(device) - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - sampler.include_gpu_probs_tensor = True - sampler.should_modify_greedy_probs_inplace = False - - sampling_params = SamplingParams(temperature=0) - - mock_inplace = Mock() - with patch( - "vllm.model_executor.layers.sampler._modify_greedy_probs_inplace", - mock_inplace): - - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - mock_inplace.assert_not_called() - - assert sampler_output.sampled_token_probs is not None - assert sampler_output.logprobs is not None - assert sampler_output.sampled_token_ids is not None diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py deleted file mode 100644 index 5a0efd98acc1..000000000000 --- a/tests/samplers/test_seeded_generate.py +++ /dev/null @@ -1,86 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Verify that seeded random sampling is deterministic. - -Run `pytest tests/samplers/test_seeded_generate.py`. -""" -import copy -import random -from itertools import combinations - -import pytest - -from vllm import SamplingParams -from vllm.model_executor.utils import set_random_seed - -MODEL = "facebook/opt-125m" -RANDOM_SEEDS = list(range(5)) - - -@pytest.fixture -def vllm_model(vllm_runner, monkeypatch): - # This file relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") - with vllm_runner(MODEL, dtype="half") as vllm_model: - yield vllm_model - - -@pytest.mark.parametrize("seed", RANDOM_SEEDS) -def test_random_sample_with_seed( - vllm_model, - example_prompts, - seed: int, -) -> None: - set_random_seed(seed) - - sampling_params = SamplingParams( - # Parameters to ensure sufficient randomness - temperature=3.0, - top_p=min(random.random() + 0.3, 1), - top_k=random.randint(5, 20), - n=random.randint(1, 10), - presence_penalty=random.randint(0, 1), - max_tokens=8, - ignore_eos=True, - ) - - sampling_params_seed_1 = copy.deepcopy(sampling_params) - sampling_params_seed_1.seed = 100 - sampling_params_seed_2 = copy.deepcopy(sampling_params) - sampling_params_seed_2.seed = 200 - - llm = vllm_model.llm - - for prompt in example_prompts: - for params in ( - sampling_params, - sampling_params_seed_1, - sampling_params_seed_2, - sampling_params, - sampling_params_seed_1, - sampling_params_seed_2, - ): - llm._add_request(prompt, params=params) - - results = llm._run_engine(use_tqdm=False) - all_outputs = [[out.token_ids for out in output.outputs] - for output in results] - - for i in range(0, len(example_prompts), 6): - outputs = all_outputs[i:i + 6] - - # verify all non-seeded requests differ - for output_a, output_b in combinations( - (outputs[0], outputs[1], outputs[2], outputs[3]), - 2, - ): - assert output_a != output_b - - # verify requests with the same seed match - assert outputs[1] == outputs[4] - assert outputs[2] == outputs[5] - - # verify generations within the same parallel sampling group differ - for output in outputs: - for sub_output_a, sub_output_b in combinations(output, 2): - assert sub_output_a != sub_output_b diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py deleted file mode 100644 index edc0849dff33..000000000000 --- a/tests/test_cache_block_hashing.py +++ /dev/null @@ -1,97 +0,0 @@ -# 
SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test hashing of cache blocks. - -Run `pytest tests/test_cache_block_hashing.py`. -""" -from typing import Optional - -import pytest - -from vllm.inputs import token_inputs -from vllm.lora.request import LoRARequest -from vllm.sequence import Sequence -from vllm.transformers_utils.tokenizer_group import TokenizerGroup - -# Make two prefixes with different first blocks. -prefix_start = [("You are an expert"), ("You are a")] -prefix_common = ( - " school principal, skilled in effectively managing " - "faculty and staff. Draft 10-15 questions for a potential first grade " - "Head Teacher for my K-12, all-girls', independent school that emphasizes " - "community, joyful discovery, and life-long learning. The candidate is " - "coming in for a first-round panel interview for a 8th grade Math " - "teaching role. They have 5 years of previous teaching experience " - "as an assistant teacher at a co-ed, public school with experience " - "in middle school math teaching. Based on this, fulfill " - "the following: ") -prefixes = [start + prefix_common for start in prefix_start] - -# Sample prompts. -sample_prompts = [ - "Hello, my name is", "The president of the United States is", - "The capital of France is", "The future of AI is" -] - - -# Helper function. -def flatten_2d(li): - return [lss for ls in li for lss in ls] - - -@pytest.mark.parametrize("model", ["facebook/opt-125m"]) -@pytest.mark.parametrize("block_size", [16]) -@pytest.mark.parametrize("max_num_seqs", [256]) -@pytest.mark.parametrize("concurrent_lora_int_ids", - [[None], [1], [None, 1], [None, 1, 2], [1, 2]]) -def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, - concurrent_lora_int_ids: list[Optional[int]]): - - tokenizer = TokenizerGroup( - tokenizer_id="facebook/opt-125m", - enable_lora=False, - max_num_seqs=max_num_seqs, - max_input_length=None, - ) - - hashes: list[list[list[int]]] = [] - - for prefix in prefixes: - for lora_int_id in concurrent_lora_int_ids: - lora_request = None - - if lora_int_id is not None: - lora_request = LoRARequest( - f"example_lora_{lora_int_id}", - lora_int_id, - f"example/path/to/lora_{lora_int_id}", - ) - - hashes.append([]) - prompts = [prefix + prompt for prompt in sample_prompts] - for seq_id, prompt in enumerate(prompts): - hashes[-1].append([]) - prompt_token_ids = tokenizer.encode(prompt) - seq = Sequence(seq_id, - inputs=token_inputs(prompt_token_ids, - prompt=prompt), - block_size=block_size, - eos_token_id=tokenizer.tokenizer.eos_token_id, - lora_request=lora_request) - - num_blocks = len(prompt_token_ids) // block_size - for idx in range(num_blocks): - hashes[-1][-1].append(seq.hash_of_block(idx)) - - # Check that hashes made with two prefixes with different first blocks are - # different everywhere. - for hash0, hash1 in zip(flatten_2d(hashes[0]), flatten_2d(hashes[1])): - assert (hash0 != hash1) - - # Check that hashes of different prompts made with the same prefix are the - # same until the hashes that contain the prompt. 
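# A hedged gloss on the assertions below: prompts that share the long common prefix
# above produce identical hashes for every block except the last one they hash, and
# that final hash, whose block reaches into the prompt-specific tokens, is unique
# per prompt.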
- for hash_pref in hashes: - same_hashes = [tuple(h[:-1]) for h in hash_pref] - different_hashes = [h[-1] for h in hash_pref] - assert (len(set(same_hashes)) == 1) - assert (len(set(different_hashes)) == len(different_hashes)) diff --git a/tests/test_sequence.py b/tests/test_sequence.py deleted file mode 100644 index c734c8514a6d..000000000000 --- a/tests/test_sequence.py +++ /dev/null @@ -1,100 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import (CompletionSequenceGroupOutput, SequenceData, - SequenceOutput) - -from .core.utils import create_dummy_prompt - - -@pytest.fixture -def sample_outputs(): - return [ - CompletionSequenceGroupOutput(samples=[ - SequenceOutput(parent_seq_id=0, output_token=i, logprobs={}) - ], - prompt_logprobs=None) for i in range(5) - ] - - -@pytest.fixture -def sampler_output(sample_outputs): - return SamplerOutput(outputs=sample_outputs) - - -def test_sampler_output_initialization(sampler_output, sample_outputs): - assert len(sampler_output) == len(sample_outputs) - assert sampler_output.sampled_token_probs is None - assert sampler_output.sampled_token_ids is None - - -def test_sampler_output_getitem(sampler_output, sample_outputs): - assert sampler_output[2] == sample_outputs[2] - - -def test_sampler_output_setitem(sampler_output): - new_output = CompletionSequenceGroupOutput(samples=[ - SequenceOutput(parent_seq_id=0, output_token=99, logprobs={}) - ], - prompt_logprobs=None) - sampler_output[2] = new_output - assert sampler_output[2] == new_output - - -def test_sampler_output_len(sampler_output, sample_outputs): - assert len(sampler_output) == len(sample_outputs) - - -def test_sampler_output_eq(sample_outputs): - sampler_output1 = SamplerOutput(outputs=sample_outputs) - sampler_output2 = SamplerOutput(outputs=sample_outputs.copy()) - sampler_output3 = SamplerOutput(outputs=sample_outputs[:-1]) - assert sampler_output1 == sampler_output2 - assert sampler_output1 != sampler_output3 - - -def test_sequence_data_prefill(): - seq_data = SequenceData.from_seqs([1, 2, 3, 4]) - assert seq_data.get_num_uncomputed_tokens() == 4 - assert seq_data.get_num_computed_tokens() == 0 - # advance by 2 - seq_data.update_num_computed_tokens(2) - assert seq_data.get_num_uncomputed_tokens() == 2 - assert seq_data.get_num_computed_tokens() == 2 - - # advance by 1 - seq_data.update_num_computed_tokens(1) - assert seq_data.get_num_uncomputed_tokens() == 1 - assert seq_data.get_num_computed_tokens() == 3 - - # append tokens and reset, simulating recompute - seq_data.append_token_id(1, logprob=0.0) - seq_data.reset_state_for_recompute() - assert seq_data.get_num_uncomputed_tokens() == 5 - assert seq_data.get_num_computed_tokens() == 0 - - -def test_sequence_group_stage(): - _, seq_group = create_dummy_prompt("1", 12) - assert seq_group.is_prefill() is True - seq_group.update_num_computed_tokens(6) - assert seq_group.is_prefill() is True - seq_group.update_num_computed_tokens(5) - assert seq_group.is_prefill() is True - seq_group.update_num_computed_tokens(1) - assert seq_group.is_prefill() is False - seqs = seq_group.get_seqs() - assert len(seqs) == 1 - seqs[0].data.append_token_id(1, logprob=0.0) - for seq in seq_group.get_seqs(): - seq.reset_state_for_recompute() - assert seq_group.is_prefill() is True - seq_group.update_num_computed_tokens(5) - assert seq_group.is_prefill() is True - 
seq_group.update_num_computed_tokens(7) - assert seq_group.is_prefill() is True - seq_group.update_num_computed_tokens(1) - assert seq_group.is_prefill() is False diff --git a/tests/worker/__init__.py b/tests/worker/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/worker/conftest.py b/tests/worker/conftest.py deleted file mode 100644 index 3f202d4dbe94..000000000000 --- a/tests/worker/conftest.py +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module tests V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') \ No newline at end of file diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py deleted file mode 100644 index 35ac90b38e84..000000000000 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ /dev/null @@ -1,648 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import itertools - -import pytest -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.platforms import current_platform -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import make_tensor_with_pad -from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner - -BATCH_SIZES = [1, 4, 16, 64, 256] - - -def _create_model_runner(model: str, *args, - **kwargs) -> EncoderDecoderModelRunner: - engine_args = EngineArgs(model, *args, **kwargs) - engine_config = engine_args.create_engine_config() - model_runner = EncoderDecoderModelRunner( - vllm_config=engine_config, - is_driver_worker=True, - ) - return model_runner - - -@pytest.mark.skipif(condition=current_platform.is_cpu(), - reason="CPU backend is currently " - "unsupported for encoder/ " - "decoder models") -def test_empty_seq_group(): - """Verify prepare prompt and decode returns empty output - for empty seq group list""" - - model_runner = _create_model_runner( - "facebook/bart-base", - seed=0, - dtype="float16", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enforce_eager=True, - ) - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - ( - input_tokens, - input_positions, - encoder_input_tokens, - encoder_input_positions, - attn_metadata, - return_seq_lens, - ) = ( - model_input.input_tokens, - model_input.input_positions, - model_input.encoder_input_tokens, - model_input.encoder_input_positions, - model_input.attn_metadata, - model_input.seq_lens, - ) - assert input_tokens is None - assert input_positions is None - assert encoder_input_tokens is None - assert encoder_input_positions is None - assert attn_metadata is None - assert return_seq_lens is None - - -@pytest.mark.skipif(condition=current_platform.is_cpu(), - reason="CPU backend is currently " - "unsupported for encoder/ " - "decoder models") -@pytest.mark.parametrize("batch_size", BATCH_SIZES) -def test_prepare_prompt(batch_size): - ''' - Test the ability of the encoder/decoder model runner subclass to - produce prefill-phase model inputs & attention metadata. 
- - Test behavior: - - * Instantiate BART base model & enc/dec model runner - * Construct sequence-group metadata for dummy prompts - * Test that encoder attention, decoder self-attention, - and encoder/decoder cross-attention inputs are correct - - Arguments: - - * batch_size - * backend_name: The attention backend under test - * enforce_eager: Enforce eager mode if True (i.e. no CUDAGraph) - ''' - - model_runner = _create_model_runner( - "facebook/bart-base", - seed=0, - dtype="float16", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enforce_eager=True, - ) - - seq_lens: list[int] = [] - encoder_seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = {0: [1]} - cross_block_table = [2] - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_lens.append(seq_len) - seq_data = SequenceData.from_seqs(range(seq_len)) - encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 - encoder_seq_lens.append(encoder_seq_len) - encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len)) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - encoder_seq_data=encoder_seq_data, - cross_block_table=cross_block_table, - ) - assert seq_group_metadata.token_chunk_size == seq_data.get_len() - seq_group_metadata_list.append(seq_group_metadata) - - # Build - # * Decoder model inputs - # * Decoder self-attention KV caching data structures - # * Encoder model inputs - # * Encoder/decoder cross-attention KV caching data structures - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - slot_mapping = attn_metadata.slot_mapping - encoder_input_tokens = model_input.encoder_input_tokens - encoder_input_positions = model_input.encoder_input_positions - cross_slot_mapping = attn_metadata.cross_slot_mapping - assert return_seq_lens == seq_lens - assert len(slot_mapping) == len(input_tokens) - assert len(cross_slot_mapping) == len(encoder_input_tokens) - - # Verify input metadata is correct for prompts. - # - Decoder attention metadata - device = model_runner.device - assert attn_metadata.num_prefills > 0 - assert attn_metadata.num_decode_tokens == 0 - assert torch.equal(attn_metadata.seq_lens_tensor, - torch.tensor(seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.max_prefill_seq_len == max(seq_lens) - assert attn_metadata.max_decode_seq_len == 0 - # - Encoder attention metadata - assert attn_metadata.encoder_seq_lens == encoder_seq_lens - assert torch.equal( - attn_metadata.encoder_seq_lens_tensor, - torch.tensor(encoder_seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.max_encoder_seq_len == max(encoder_seq_lens) - assert attn_metadata.num_encoder_tokens == sum(encoder_seq_lens) - - # Test decoder subquery start locs. 
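# For example (hedged, illustrative numbers): with per-sequence prompt lengths of
# [3, 5, 2], the cumulative start locations built below are [0, 3, 8, 10]; in the
# prefill phase query_start_loc and seq_start_loc coincide because every prompt
# token is part of the query.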
- start_idx = 0 - start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - start_loc.append(start_idx) - assert torch.equal( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device), - ) - - # Test decoder seq start locs & context lengths - - assert torch.equal( - attn_metadata.seq_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device), - ) - assert torch.equal( - attn_metadata.context_lens_tensor, - torch.zeros(attn_metadata.context_lens_tensor.shape[0], - dtype=torch.int, - device=device), - ) - - # Verify block tables are correct for prompts - # - Decoder self-attention - expected = torch.tensor( - [[] for _ in range(len(seq_group_metadata_list))], - dtype=torch.int32, - device=model_runner.device, - ) - assert torch.equal( - attn_metadata.block_tables, - expected, - ) - # - Encoder/decoder cross-attention - assert torch.equal( - attn_metadata.cross_block_tables, - expected, - ) - - # Cuda graph should not be used for prefill. - assert attn_metadata.use_cuda_graph is False - - # Verify the lengths of input tokens & positions - # - Decoder - assert len(input_tokens) == sum(seq_lens) - assert len(input_positions) == sum(seq_lens) - # -- An indirect check that model_input.input_tokens - # and model_input.input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - input_tokens, - input_positions, - ) - # - Encoder - assert len(encoder_input_tokens) == sum(encoder_seq_lens) - # -- An indirect check that model_input.encoder_input_tokens - # and model_input.encoder_input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - encoder_input_tokens, - encoder_input_positions, - ) - - # Test that vLLM sampling infrastructure chooses the correct - # sequence positions at which to sample (i.e. the end of - # each sequence) in the prefill phase - - expected_selected_token_indices = [] - selected_token_start_idx = 0 - for seq_len in seq_lens: - # Compute the index offset of the final token in each - # prompt (recall that the prompts are concatenated) - expected_selected_token_indices.append(selected_token_start_idx + - seq_len - 1) - selected_token_start_idx += seq_len - - sampling_metadata = model_input.sampling_metadata - actual = sampling_metadata.selected_token_indices - expected = torch.tensor( - expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype, - ) - assert torch.equal(actual, expected) - - -@pytest.mark.skipif(condition=current_platform.is_cpu(), - reason="CPU backend is currently " - "unsupported for encoder/ " - "decoder models") -@pytest.mark.parametrize("batch_size", BATCH_SIZES) -@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False]) -def test_prepare_decode(batch_size, multiple_seqs_per_seq_group): - ''' - Test the ability of the encoder/decoder model runner subclass to - produce decode-phase model inputs & attention metadata. 
- - Test behavior: - - * Instantiate BART base model & enc/dec model runner - * Construct sequence-group metadata for dummy prompts - * Test that encoder attention, decoder self-attention, - and encoder/decoder cross-attention inputs are correct - - Arguments: - - * batch_size - * multiple_seqs_per_seq_group - * backend_name: The attention backend under test - * enforce_eager: Enforce eager mode if True (i.e. no CUDAGraph) - ''' - - model_runner = _create_model_runner( - "facebook/bart-base", - seed=0, - dtype="float16", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enforce_eager=True, - ) - - seq_lens: list[int] = [] - encoder_seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = { - 0: [1], - 1: [3] - } if multiple_seqs_per_seq_group else { - 0: [1] - } - cross_block_table = [2] - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_data = SequenceData.from_seqs(range(seq_len)) - encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 - encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len)) - - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={ - 0: seq_data, - 1: seq_data - } if multiple_seqs_per_seq_group else {0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - encoder_seq_data=encoder_seq_data, - cross_block_table=cross_block_table, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_group_metadata_list.append(seq_group_metadata) - seq_lens.extend( - [seq_len for _ in range(len(seq_group_metadata.seq_data))]) - encoder_seq_lens.extend( - [encoder_seq_len for _ in range(len(seq_group_metadata.seq_data))]) - - # Build - # * Decoder model inputs - # * Decoder self-attention KV caching data structures - # * Encoder model inputs - # * Encoder/decoder cross-attention KV caching data structures - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - slot_mapping = attn_metadata.slot_mapping - encoder_input_tokens = model_input.encoder_input_tokens - encoder_input_positions = model_input.encoder_input_positions - cross_slot_mapping = attn_metadata.cross_slot_mapping - assert return_seq_lens == seq_lens - assert len(slot_mapping) == len(input_tokens) - assert len(cross_slot_mapping) == len(encoder_input_tokens) - - # Verify input metadata is correct for decode phase. - # - Decoder attention metadata - device = model_runner.device - assert attn_metadata.num_prefills == 0 - assert attn_metadata.num_decode_tokens > 0 - assert torch.equal(attn_metadata.seq_lens_tensor, - torch.tensor(seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.max_prefill_seq_len == 0 - assert attn_metadata.max_decode_seq_len == max(seq_lens) - # - Encoder attention metadata - assert attn_metadata.encoder_seq_lens == encoder_seq_lens - assert torch.equal( - attn_metadata.encoder_seq_lens_tensor, - torch.tensor(encoder_seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.max_encoder_seq_len == max(encoder_seq_lens) - assert attn_metadata.num_encoder_tokens == sum(encoder_seq_lens) - - # Test decoder subquery start locs. 
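# For example (hedged, illustrative numbers): in the decode phase each sequence
# contributes exactly one query token, so for a batch of three rows the start
# locations built below are simply [0, 1, 2, 3], while seq_start_loc, computed a
# few lines further down, still accumulates the full sequence lengths.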
- start_idx = 0 - start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += 1 - start_loc.append(start_idx) - assert torch.equal( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device), - ) - - # Test decoder seq start locs. Note that for normal prefill it is - # equivalent to query_start_loc. - start_idx = 0 - seq_start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - seq_start_loc.append(start_idx) - - # Test seq_start_loc and context lengths - - assert torch.equal( - attn_metadata.seq_start_loc, - torch.tensor(seq_start_loc, dtype=torch.int32, device=device), - ) - assert torch.equal( - attn_metadata.context_lens_tensor, - torch.tensor([seq_len - 1 for seq_len in seq_lens], - dtype=torch.int, - device=device)) - - # Verify block tables are correct for prompts - # - Decoder self-attention - flattened_block_tables = [ - block_table for block_table in block_tables.values() - ] - expected = torch.tensor(flattened_block_tables * - len(seq_group_metadata_list), - dtype=torch.int32, - device=model_runner.device) - assert torch.equal( - attn_metadata.block_tables, - expected, - ) - # - Encoder/decoder cross-attention - expected = torch.tensor([ - cross_block_table for seq_group_metadata in seq_group_metadata_list - for _ in range(len(seq_group_metadata.seq_data)) - ], - dtype=torch.int32, - device=model_runner.device) - assert torch.equal( - attn_metadata.cross_block_tables, - expected, - ) - - # Model runner's CUDAGraph setting should be propagated to attention - # metadata. - assert attn_metadata.use_cuda_graph is False - - # Verify the lengths of input tokens & positions - # - Decoder - assert len(input_tokens) == len(seq_lens) - assert len(input_positions) == len(seq_lens) - # -- An indirect check that model_input.input_tokens - # and model_input.input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - input_tokens, - input_positions, - ) - # - Encoder - assert len(encoder_input_tokens) == 0 - assert len(encoder_input_tokens) == 0 - # -- An indirect check that model_input.encoder_input_tokens - # and model_input.encoder_input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - encoder_input_tokens, - encoder_input_positions, - ) - - # Test that vLLM sampling infrastructure chooses the correct - # sequence positions at which to sample (i.e. the end of - # each sequence) in the decode phase - - expected_selected_token_indices = [] - for selected_token_start_idx, seq_len in enumerate(seq_lens): - # Compute the index offset of the final token in each - # sequence's decoded outputs; since a single token is - # decoded per iteration per sequence, then the length - # of the decoded tokens for a given sequence is 1 and - # the final index offset into a given sequence's - # generated tokens is 0 (i.e. 
the expected sampling index - # for a given sequence is just `selected_token_start_idx`) - expected_selected_token_indices.append(selected_token_start_idx) - - sampling_metadata = model_input.sampling_metadata - actual = sampling_metadata.selected_token_indices - expected = torch.tensor( - expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype, - ) - assert torch.equal(actual, expected) - - -@pytest.mark.parametrize("batch_size", list(range(1, 257))) -@pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False]) -def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group): - """ - Tests that for encoder-decoder models with CUDA Graph capture and replay - enabled, the tensors used during the decode phase are correctly padded - for varying input batch sizes. - """ - model_runner = _create_model_runner( - "facebook/bart-base", - seed=0, - dtype="float16", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enforce_eager=False, - ) - block_tables = { - 0: [1], - 1: [3] - } if multiple_seqs_per_seq_group else { - 0: [1] - } - seq_lens: list[int] = [] - encoder_seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - - cross_block_table = [2] - expanded_batch_size = 0 - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_data = SequenceData.from_seqs(range(seq_len)) - encoder_seq_len = (i + 1) % (model_runner.block_size - 1) + 1 - encoder_seq_data = SequenceData.from_seqs(range(encoder_seq_len)) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={ - 0: seq_data, - 1: seq_data - } if multiple_seqs_per_seq_group else {0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - encoder_seq_data=encoder_seq_data, - cross_block_table=cross_block_table, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_lens.extend( - [seq_len for _ in range(len(seq_group_metadata.seq_data))]) - encoder_seq_lens.extend( - [encoder_seq_len for _ in range(len(seq_group_metadata.seq_data))]) - expanded_batch_size = expanded_batch_size + len( - seq_group_metadata.seq_data) - seq_group_metadata_list.append(seq_group_metadata) - - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - slot_mapping = attn_metadata.slot_mapping - encoder_input_tokens = model_input.encoder_input_tokens - encoder_input_positions = model_input.encoder_input_positions - cross_slot_mapping = attn_metadata.cross_slot_mapping - - # With CUDA Graph capture and replay enabled, the decoder and encoder - # input sequences will be padded. Create the expected padded tensors - # accordingly. 
- graph_batch_size = model_runner.vllm_config.pad_for_cudagraph( - expanded_batch_size) - cuda_graph_pad_size = graph_batch_size - expanded_batch_size - padded_seq_lens = seq_lens + list(itertools.repeat(1, cuda_graph_pad_size)) - padded_encoder_seq_lens = encoder_seq_lens + list( - itertools.repeat(1, cuda_graph_pad_size)) - - assert return_seq_lens == padded_seq_lens - assert len(slot_mapping) == len(input_tokens) - assert len(cross_slot_mapping) == len(encoder_input_tokens) - - # Verify attention metadata - device = model_runner.device - assert attn_metadata.num_prefills == 0 - assert attn_metadata.num_decode_tokens > 0 - assert torch.equal( - attn_metadata.seq_lens_tensor, - torch.tensor(padded_seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.seq_lens == padded_seq_lens - assert attn_metadata.max_prefill_seq_len == 0 - assert attn_metadata.max_decode_seq_len == max(seq_lens) - # - Encoder attention metadata - assert attn_metadata.encoder_seq_lens == padded_encoder_seq_lens - assert torch.equal( - attn_metadata.encoder_seq_lens_tensor, - torch.tensor(padded_encoder_seq_lens, device=device, dtype=torch.int)) - assert attn_metadata.max_encoder_seq_len == max(padded_encoder_seq_lens) - assert attn_metadata.num_encoder_tokens == sum(padded_encoder_seq_lens) - - # Verify block tables are correct for prompts - # - Decoder self-attention. Pad the block tables as expected. - flattened_block_tables = [ - block_table for _ in range(len(seq_group_metadata_list)) - for block_table in block_tables.values() - ] - flattened_block_tables.extend([[] for _ in range(cuda_graph_pad_size)]) - expected = make_tensor_with_pad( - flattened_block_tables, - max_len=64, - pad=0, - dtype=torch.int32, - device=model_runner.device, - ) - assert torch.equal( - attn_metadata.block_tables, - expected, - ) - # - Encoder/decoder cross-attention. Pad the cross-attention block tables - # as expected. - expected = [ - cross_block_table for seq_group_metadata in seq_group_metadata_list - for _ in range(len(seq_group_metadata.seq_data)) - ] - expected.extend([[] for _ in range(cuda_graph_pad_size)]) - expected = make_tensor_with_pad( - expected, - max_len=64, - pad=0, - dtype=torch.int32, - device=model_runner.device, - ) - assert torch.equal( - attn_metadata.cross_block_tables, - expected, - ) - - # Model runner's CUDAGraph setting should be propagated to attention - # metadata. 
- assert attn_metadata.use_cuda_graph is True - - # Verify the lengths of input tokens & positions - # - Decoder - assert len(input_tokens) == len(padded_seq_lens) - assert len(input_positions) == len(padded_seq_lens) - # -- An indirect check that model_input.input_tokens - # and model_input.input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - input_tokens, - input_positions, - ) - # - Encoder - assert len(encoder_input_tokens) == 0 - assert len(encoder_input_tokens) == 0 - # -- An indirect check that model_input.encoder_input_tokens - # and model_input.encoder_input_positions are correct - - # by design of the test, the input tokens are - # equal to the input position values, so if - # the model_input data structure has the correct - # values then these two should be equal - assert torch.equal( - encoder_input_tokens, - encoder_input_positions, - ) diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py deleted file mode 100644 index ec33d334ab65..000000000000 --- a/tests/worker/test_model_input.py +++ /dev/null @@ -1,246 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses - -import torch - -from vllm.attention import AttentionMetadata, AttentionMetadataBuilder -from vllm.attention.backends.abstract import AttentionBackend -from vllm.attention.backends.utils import CommonAttentionState -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata -from vllm.worker.multi_step_model_runner import StatefulModelInput -from vllm.worker.pooling_model_runner import ( - ModelInputForGPUWithPoolingMetadata) - - -class MockAttentionBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - raise NotImplementedError - - @staticmethod - def get_impl_cls(): - raise NotImplementedError - - @staticmethod - def get_metadata_cls() -> type["AttentionMetadata"]: - return AttentionMetadata - - @staticmethod - def get_builder_cls() -> type["AttentionMetadataBuilder"]: - return AttentionMetadataBuilder - - @staticmethod - def get_state_cls() -> type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> tuple[int, ...]: - raise NotImplementedError - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - pass - - @staticmethod - def copy_blocks( - kv_caches: list[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - pass - - -def test_model_runner_input(): - sampling_metadata = SamplingMetadata( - ["seq_group"], - "selected_token_indices", - "categorized_sample_indices", - "num_prompts", - ) - attn_metadata = AttentionMetadata( - num_prefills=1, - num_prefill_tokens=2, - num_decode_tokens=3, - slot_mapping=torch.zeros(1), - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - ) - model_input = ModelInputForGPUWithSamplingMetadata( - input_tokens=torch.ones(10), - input_positions=torch.ones(10), - sampling_metadata=sampling_metadata, - attn_metadata=attn_metadata) - - assert isinstance(model_input, ModelInputForGPUWithSamplingMetadata) - - 
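# A minimal, self-contained sketch of the broadcast round trip exercised
# below, assuming only a toy dataclass (the names here are hypothetical,
# not vLLM's API): flatten the model input into a plain tensor dict on the
# driver, then rebuild an equivalent object on the receiving worker.
import dataclasses

import torch


@dataclasses.dataclass
class _ToyModelInput:
    input_tokens: torch.Tensor
    input_positions: torch.Tensor

    def as_broadcastable_tensor_dict(self) -> dict:
        return dataclasses.asdict(self)

    @classmethod
    def from_broadcasted_tensor_dict(cls, tensor_dict: dict) -> "_ToyModelInput":
        return cls(**tensor_dict)


_sent = _ToyModelInput(torch.ones(4), torch.arange(4))
_received = _ToyModelInput.from_broadcasted_tensor_dict(
    _sent.as_broadcastable_tensor_dict())
assert torch.equal(_sent.input_tokens, _received.input_tokens)
assert torch.equal(_sent.input_positions, _received.input_positions)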
# Test round trip serialization. - tensor_dict = model_input.as_broadcastable_tensor_dict() - attn_backend = MockAttentionBackend() - received_model_input = ( - ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( - tensor_dict, attn_backend=attn_backend)) - # Check that received copy has correct values. - assert isinstance(received_model_input, - ModelInputForGPUWithSamplingMetadata) - assert received_model_input.input_tokens is not None - assert ( - received_model_input.input_tokens == model_input.input_tokens).all() - assert received_model_input.input_positions is not None - assert (received_model_input.input_positions == model_input.input_positions - ).all() - assert received_model_input.multi_modal_kwargs is None - assert (received_model_input.multi_modal_kwargs == - model_input.multi_modal_kwargs) - assert received_model_input.lora_requests is None - assert received_model_input.lora_requests == model_input.lora_requests - assert received_model_input.lora_mapping is None - assert received_model_input.lora_mapping == model_input.lora_mapping - for field in dataclasses.fields(AttentionMetadata): - assert getattr(received_model_input.attn_metadata, field.name, - None) == getattr(attn_metadata, field.name, None) - # For sampling metadata, only selected_token_indices is copied. - assert (received_model_input.sampling_metadata.selected_token_indices == - sampling_metadata.selected_token_indices) - assert received_model_input.sampling_metadata.seq_groups is None - - -def test_embedding_model_runner_input(): - pooling_metadata = PoolingMetadata( - seq_groups=[[0]], - seq_data={}, - prompt_lens=[1], - ) - attn_metadata = AttentionMetadata( - num_prefills=1, - num_prefill_tokens=2, - num_decode_tokens=3, - slot_mapping=torch.zeros(1), - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - ) - model_input = ModelInputForGPUWithPoolingMetadata( - input_tokens=torch.ones(10), - input_positions=torch.ones(10), - pooling_metadata=pooling_metadata, - attn_metadata=attn_metadata) - - assert isinstance(model_input, ModelInputForGPUWithPoolingMetadata) - - # Test round trip serialization. - tensor_dict = model_input.as_broadcastable_tensor_dict() - attn_backend = MockAttentionBackend() - received_model_input = ( - ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict( - tensor_dict, attn_backend=attn_backend)) - # Check that received copy has correct values. - assert isinstance(received_model_input, - ModelInputForGPUWithPoolingMetadata) - assert received_model_input.input_tokens is not None - assert ( - received_model_input.input_tokens == model_input.input_tokens).all() - assert received_model_input.input_positions is not None - assert (received_model_input.input_positions == model_input.input_positions - ).all() - assert received_model_input.multi_modal_kwargs is None - assert (received_model_input.multi_modal_kwargs == - model_input.multi_modal_kwargs) - assert received_model_input.lora_requests is None - assert received_model_input.lora_requests == model_input.lora_requests - assert received_model_input.lora_mapping is None - assert received_model_input.lora_mapping == model_input.lora_mapping - for field in dataclasses.fields(AttentionMetadata): - assert getattr(received_model_input.attn_metadata, field.name, - None) == getattr(attn_metadata, field.name, None) - # Pooling metadata is not broadcast. 
- assert received_model_input.pooling_metadata is None - - -def test_multi_step_model_runner_input(): - sampling_metadata = SamplingMetadata( - ["seq_group"], - "selected_token_indices", - "categorized_sample_indices", - "num_prompts", - ) - attn_metadata = AttentionMetadata( - num_prefills=1, - num_prefill_tokens=2, - num_decode_tokens=3, - slot_mapping=torch.zeros(1), - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - ) - frozen_model_input = ModelInputForGPUWithSamplingMetadata( - input_tokens=torch.ones(10), - input_positions=torch.ones(10), - sampling_metadata=sampling_metadata, - attn_metadata=attn_metadata) - - model_input = StatefulModelInput( - frozen_model_input=frozen_model_input, - is_last_step=True, - is_first_multi_step=False, - current_step=4, - last_sampled_token_ids=torch.ones((10, 1)), - is_multi_step=True, - num_queries=8, - num_seqs=5, - cached_outputs=[], - ) - - assert isinstance(model_input, StatefulModelInput) - - # Test round trip serialization. - tensor_dict = model_input.as_broadcastable_tensor_dict() - attn_backend = MockAttentionBackend() - received_model_input = (StatefulModelInput.from_broadcasted_tensor_dict( - tensor_dict, attn_backend=attn_backend)) - - received_frozen_input = received_model_input.frozen_model_input - - # Check that received copy has correct values. - assert isinstance(received_model_input, StatefulModelInput) - assert received_frozen_input.input_tokens is not None - assert (received_frozen_input.input_tokens == - frozen_model_input.input_tokens).all() - assert received_frozen_input.input_positions is not None - assert (received_frozen_input.input_positions == - frozen_model_input.input_positions).all() - assert received_frozen_input.multi_modal_kwargs is None - assert (frozen_model_input.multi_modal_kwargs == - frozen_model_input.multi_modal_kwargs) - assert received_frozen_input.lora_requests is None - assert (received_frozen_input.lora_requests == - frozen_model_input.lora_requests) - assert received_frozen_input.lora_mapping is None - assert ( - received_frozen_input.lora_mapping == frozen_model_input.lora_mapping) - for field in dataclasses.fields(AttentionMetadata): - assert getattr(received_frozen_input.attn_metadata, field.name, - None) == getattr(attn_metadata, field.name, None) - # For sampling metadata, only selected_token_indices is copied. 
- assert (received_frozen_input.sampling_metadata.selected_token_indices == - sampling_metadata.selected_token_indices) - assert received_frozen_input.sampling_metadata.seq_groups is None - - # check non frozen fields - assert received_model_input.is_last_step == model_input.is_last_step - assert (received_model_input.is_first_multi_step == - model_input.is_first_multi_step) - assert received_model_input.current_step == model_input.current_step - assert (received_model_input.last_sampled_token_ids == - model_input.last_sampled_token_ids).all() - assert received_model_input.is_multi_step == model_input.is_multi_step diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py deleted file mode 100644 index 0be25aa2fc35..000000000000 --- a/tests/worker/test_model_runner.py +++ /dev/null @@ -1,462 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.engine.arg_utils import EngineArgs -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import get_open_port -from vllm.worker.model_runner import ModelRunner - - -def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: - engine_args = EngineArgs(model, *args, **kwargs) - engine_config = engine_args.create_engine_config() - model_runner = ModelRunner( - vllm_config=engine_config, - is_driver_worker=True, - ) - return model_runner - - -def test_deepseek_mla_attn_backend_module(): - model_runner = _create_model_runner( - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", - trust_remote_code=True, - enable_chunked_prefill=False, - ) - assert model_runner.attn_backend.__name__ == "TritonMLABackend" - - -@pytest.mark.parametrize("batch_size", list(range(1, 257, 3))) -@pytest.mark.parametrize("use_prompt_embeds", [True, False]) -def test_prepare_prompt(batch_size, use_prompt_embeds, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enable_prompt_embeds=True, - ) - - seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = {0: [1]} - expected_input_embeds_len = 0 - for i in range(batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_lens.append(seq_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * seq_len, - prompt_embeds=torch.rand(seq_len, 10), - ) - expected_input_embeds_len += seq_len - else: - seq_data = SequenceData.from_seqs(prompt_token_ids=range(seq_len)) - - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - ) - assert seq_group_metadata.token_chunk_size == seq_data.get_len() - seq_group_metadata_list.append(seq_group_metadata) - - expected_selected_token_indices = [] - selected_token_start_idx = 0 - for seq_len in seq_lens: - expected_selected_token_indices.append(selected_token_start_idx + - seq_len - 1) - selected_token_start_idx += seq_len - model_input = 
model_runner._prepare_model_input_tensors(
-        seq_group_metadata_list)
-    input_tokens = model_input.input_tokens
-    input_positions = model_input.input_positions
-    input_embeds = model_input.inputs_embeds
-    attn_metadata = model_input.attn_metadata
-    return_seq_lens = model_input.seq_lens
-    slot_mapping = attn_metadata.slot_mapping
-    assert return_seq_lens == seq_lens
-    assert len(slot_mapping) == len(input_tokens)
-
-    # Verify input metadata is correct for prompts.
-    device = model_runner.device
-    assert attn_metadata.num_prefills > 0
-    assert attn_metadata.num_decode_tokens == 0
-    torch.testing.assert_close(
-        attn_metadata.seq_lens_tensor,
-        torch.tensor(seq_lens, device=device, dtype=torch.int))
-    assert attn_metadata.seq_lens == seq_lens
-    assert attn_metadata.max_prefill_seq_len == max(seq_lens)
-    assert attn_metadata.max_decode_seq_len == 0
-
-    # Test subquery start locs.
-    start_idx = 0
-    start_loc = [start_idx]
-    for seq_len in seq_lens:
-        start_idx += seq_len
-        start_loc.append(start_idx)
-    torch.testing.assert_close(
-        attn_metadata.query_start_loc,
-        torch.tensor(start_loc, dtype=torch.int32, device=device))
-
-    # Test seq start locs. Note that for normal prefill it is
-    # equivalent to query_start_loc.
-    start_idx = 0
-    seq_start_loc = [start_idx]
-    for seq_len in seq_lens:
-        start_idx += seq_len
-        seq_start_loc.append(start_idx)
-
-    torch.testing.assert_close(
-        attn_metadata.seq_start_loc,
-        torch.tensor(start_loc, dtype=torch.int32, device=device))
-    torch.testing.assert_close(
-        attn_metadata.context_lens_tensor,
-        torch.zeros(attn_metadata.context_lens_tensor.shape[0],
-                    dtype=torch.int,
-                    device=device))
-
-    expected = torch.tensor([[] for _ in range(len(seq_group_metadata_list))],
-                            dtype=torch.int32,
-                            device=model_runner.device)
-    torch.testing.assert_close(attn_metadata.block_tables, expected)
-    # Cuda graph should not be used for prefill.
- assert attn_metadata.use_cuda_graph is False - - assert len(input_tokens) == sum(seq_lens) - assert len(input_positions) == sum(seq_lens) - if expected_input_embeds_len == 0: - torch.testing.assert_close(input_tokens, input_positions) - assert input_embeds is None - else: - assert len(input_embeds) == expected_input_embeds_len - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=model_runner.device, - pin_memory=model_runner.pin_memory) - assert len(input_tokens) == sum(seq_lens) - assert len(input_positions) == sum(seq_lens) - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - torch.allclose(input_tokens, input_positions) - - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - - -@pytest.mark.parametrize("batch_size", list(range(1, 257, 3))) -@pytest.mark.parametrize("use_prompt_embeds", [True, False]) -def test_prepare_decode_cuda_graph(batch_size, use_prompt_embeds, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=False, - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=False, - enable_prompt_embeds=True, - ) - - context_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - # Assume each seq group finishes prefill. - for i in range(batch_size): - # make sure all tokens fit into one block - context_len = i % (model_runner.block_size - 1) + 1 - context_lens.append(context_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * context_len, - prompt_embeds=torch.rand(context_len, 10), - ) - output_embed = torch.rand(10) - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(context_len)) - output_embed = None - seq_data.update_num_computed_tokens(context_len) - # Append one token ID since prefill is finished. - seq_data.append_token_id(1, 0, output_embed) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables={0: [1]}, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_group_metadata_list.append(seq_group_metadata) - - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - slot_mapping = attn_metadata.slot_mapping - - assert len(slot_mapping) == len(input_tokens) - - expected_bs = model_runner.vllm_config.pad_for_cudagraph( - len(seq_group_metadata_list)) - # Verify input metadata is correct for prompts. 
- device = model_runner.device - assert attn_metadata.num_prefills == 0 - assert attn_metadata.num_prefill_tokens == 0 - seq_lens = [context_len + 1 for context_len in context_lens] - # seq_lens are padded to expected_bs - for _ in range(expected_bs - len(seq_lens)): - seq_lens.append(1) - assert attn_metadata.seq_lens == seq_lens - assert attn_metadata.num_decode_tokens == len(seq_lens) - start_idx = 0 - start_loc = [start_idx] - for _ in context_lens: - # decode has only 1 token for query. - start_idx += 1 - start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.query_start_loc, - torch.tensor(start_loc, dtype=torch.int32, device=device)) - - start_idx = 0 - seq_start_loc = [start_idx] - for seq_len in seq_lens: - start_idx += seq_len - seq_start_loc.append(start_idx) - torch.testing.assert_close( - attn_metadata.seq_start_loc, - torch.tensor(seq_start_loc, dtype=torch.int32, device=device)) - - torch.testing.assert_close( - attn_metadata.context_lens_tensor, - torch.tensor(context_lens, dtype=torch.int, device=device)) - assert attn_metadata.max_decode_seq_len == max(seq_lens) - torch.testing.assert_close( - attn_metadata.seq_lens_tensor[:len(seq_lens)], - torch.tensor(seq_lens, dtype=torch.int, device=device)) - - # block table's first index corresponds to each batch, meaning in - # decoding it is each token. - assert attn_metadata.block_tables.shape[0] == len(input_tokens) - # Block table's second dim corresponds to each token's block number. - # It is padded up to - assert attn_metadata.block_tables.shape[1] == ( - model_runner.get_max_block_per_batch()) - assert attn_metadata.use_cuda_graph is True - - assert len(input_tokens) == expected_bs - assert len(input_positions) == expected_bs - if use_prompt_embeds: - expected_input_embeds_length = start_loc[-1] - assert len(input_embeds) == expected_input_embeds_length - assert expected_input_embeds_length <= expected_bs - else: - assert input_embeds is None - - # Verify Sampling - expected_selected_token_indices = [] - for selected_token_start_idx, _ in enumerate(context_lens): - expected_selected_token_indices.append(selected_token_start_idx) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # query lens is all 1 for decode. 
- query_lens=[1 for _ in range(len(context_lens))], - device=model_runner.device, - pin_memory=model_runner.pin_memory) - actual = sampling_metadata.selected_token_indices - expected = torch.tensor(expected_selected_token_indices, - device=actual.device, - dtype=actual.dtype) - torch.testing.assert_close(actual, expected) - - -def test_empty_seq_group(): - """Verify prepare prompt and decode returns empty output.""" - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=False, - ) - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - - assert input_tokens is None - assert input_positions is None - assert attn_metadata is None - - model_input = model_runner._prepare_model_input_tensors( - seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - return_seq_lens = model_input.seq_lens - - assert input_tokens is None - assert input_positions is None - assert input_embeds is None - assert attn_metadata is None - assert return_seq_lens is None - - -@pytest.fixture -def distributed_init(): - init_distributed_environment( - world_size=1, - rank=0, - distributed_init_method=f"tcp://127.0.0.1:{get_open_port()}", - local_rank=0) - ensure_model_parallel_initialized(1, 1) - - -@pytest.mark.parametrize("batch_size", list(range(2, 128, 3))) -@pytest.mark.parametrize("enforce_eager", [True, False]) -@pytest.mark.parametrize('use_prompt_embeds', [True, False]) -def test_hybrid_batches(batch_size, enforce_eager, use_prompt_embeds, - distributed_init, monkeypatch): - if use_prompt_embeds: - # Prompt Embeddings is only currently supported on V0 - monkeypatch.setenv("VLLM_USE_V1", "0") - - model_runner = _create_model_runner( - "facebook/opt-125m", - seed=0, - dtype="float16", - enforce_eager=enforce_eager, - max_num_batched_tokens=100000, - max_num_seqs=100000, - enable_chunked_prefill=True, - enable_prompt_embeds=True, - ) - - # Add prefill requests. 
- seq_lens: list[int] = [] - seq_group_metadata_list: list[SequenceGroupMetadata] = [] - prefill_metadata_list: list[SequenceGroupMetadata] = [] - decode_metadata_list: list[SequenceGroupMetadata] = [] - block_tables = {0: [1]} - prefill_batch_size = batch_size // 2 - decode_batch_size = batch_size - prefill_batch_size - expected_input_embeds_len = 0 - for i in range(prefill_batch_size): - # make sure all tokens fit into one block - seq_len = i % (model_runner.block_size - 1) + 1 - seq_lens.append(seq_len) - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * seq_len, - prompt_embeds=torch.rand(seq_len, 10), - ) - expected_input_embeds_len += seq_len - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(seq_len), ) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables=block_tables, - ) - assert seq_group_metadata.token_chunk_size == seq_data.get_len() - seq_group_metadata_list.append(seq_group_metadata) - prefill_metadata_list.append(seq_group_metadata) - - # Add decode requests - for i in range(prefill_batch_size, batch_size): - # make sure all tokens fit into one block - context_len = i % (model_runner.block_size - 1) + 1 - if use_prompt_embeds: - seq_data = SequenceData.from_seqs( - prompt_token_ids=[0] * context_len, - prompt_embeds=torch.rand(context_len, 10), - ) - output_embed = torch.rand(10) - # This also iterates the expected input_embeds, because the model - # needs both the input and output embeddings passed into together - expected_input_embeds_len += 1 - else: - seq_data = SequenceData.from_seqs( - prompt_token_ids=range(context_len), ) - output_embed = None - assert len(seq_data.prompt_token_ids) == context_len - seq_data.append_token_id(1, 0, output_embed) - seq_data.update_num_computed_tokens(context_len) - seq_group_metadata = SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=False, - seq_data={0: seq_data}, - sampling_params=SamplingParams(temperature=0), - block_tables={0: [1]}, - ) - assert seq_group_metadata.token_chunk_size == 1 - seq_group_metadata_list.append(seq_group_metadata) - decode_metadata_list.append(seq_group_metadata) - - model_input = model_runner.prepare_model_input(seq_group_metadata_list) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - input_embeds = model_input.inputs_embeds - attn_metadata = model_input.attn_metadata - - prefill_meta_actual = attn_metadata.prefill_metadata - decode_meta_actual = attn_metadata.decode_metadata - - assert len(attn_metadata.slot_mapping) == len(input_tokens) - assert len(input_positions) == len(input_tokens) - assert attn_metadata.num_prefills == prefill_batch_size - assert attn_metadata.num_decode_tokens == decode_batch_size - assert attn_metadata.num_prefill_tokens == sum(seq_lens) - if expected_input_embeds_len == 0: - assert input_embeds is None - else: - assert len(input_embeds) == expected_input_embeds_len - - # Verify attn metadata is consistent. We don't need to test individual - # values here because they are tested above. 
- attn_metadata = model_runner._prepare_model_input_tensors( - seq_group_metadata_list).attn_metadata - - for attr_expected, attr_actual in zip(vars(attn_metadata.prefill_metadata), - vars(prefill_meta_actual)): - assert attr_expected[1] == attr_actual[1] - for attr_expected, attr_actual in zip(vars(attn_metadata.decode_metadata), - vars(decode_meta_actual)): - assert attr_expected[1] == attr_actual[1] diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py deleted file mode 100644 index d8767f700b57..000000000000 --- a/tests/worker/test_profile.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.worker import Worker - - -def test_gpu_memory_profiling(): - # Tests the gpu profiling that happens in order to determine the number of - # KV cache blocks that we can allocate on the GPU. - # This test mocks the maximum available gpu memory so that it can run on - # any gpu setup. - - # Set up engine args to build a worker. - engine_args = EngineArgs(model="facebook/opt-125m", - dtype="half", - load_format="dummy") - engine_config = engine_args.create_engine_config() - engine_config.cache_config.num_gpu_blocks = 1000 - engine_config.cache_config.num_cpu_blocks = 1000 - - # Create the worker. - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - worker = Worker( - vllm_config=engine_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=True, - ) - - # Set 10GiB as the total gpu ram to be device-agnostic - def mock_mem_info(): - current_usage = torch.cuda.memory_stats( - )["allocated_bytes.all.current"] - mock_total_bytes = 10 * 1024**3 - free = mock_total_bytes - current_usage - - return (free, mock_total_bytes) - - from unittest.mock import patch - with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info): - # Load the model so we can profile it - worker.init_device() - worker.load_model() - gpu_blocks, _ = worker.determine_num_available_blocks() - - # Peak vram usage by torch should be 0.47 GiB - # Model weights take 0.25 GiB - # No memory should be allocated outside of torch - # 9.0 GiB should be the utilization target - # 8.28 GiB should be available for the KV cache - block_size = CacheEngine.get_cache_block_size( - engine_config.cache_config, engine_config.model_config, - engine_config.parallel_config) - - expected_blocks = (8.28 * 1024**3) // block_size - - # Check within a small tolerance for portability - # Hardware, kernel, or dependency changes could all affect memory - # utilization. - # A 100 block tolerance here should be about 60MB of wiggle room. - assert abs(gpu_blocks - expected_blocks) < 100 diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py deleted file mode 100644 index 6d9f404ac207..000000000000 --- a/tests/worker/test_swap.py +++ /dev/null @@ -1,87 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.worker.worker import Worker - - -def test_swap() -> None: - # Configure the engine. 
- engine_args = EngineArgs(model="distilbert/distilgpt2", - dtype="half", - load_format="dummy") - engine_config = engine_args.create_engine_config() - engine_config.cache_config.num_gpu_blocks = 1000 - engine_config.cache_config.num_cpu_blocks = 1000 - - # Create the worker. - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - worker = Worker( - vllm_config=engine_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=True, - ) - - # Initialize the worker. - worker.init_device() - worker.load_model() - worker.initialize_cache( - num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, - num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) - - # Randomly initialize the cache. - gpu_cache = worker.cache_engine[0].gpu_cache - cpu_cache = worker.cache_engine[0].cpu_cache - num_layers = len(gpu_cache) - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - gpu_key_cache.random_() - gpu_value_cache.random_() - cpu_key_cache, cpu_value_cache = cpu_cache[i] - cpu_key_cache.random_() - cpu_value_cache.random_() - - allclose = lambda a, b: torch.allclose( - a.cuda(), b.cuda(), rtol=0.0, atol=0.0) - - # Test swap out. - blocks_to_swap_out = [(3, 72), (56, 35), (84, 34)] - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=[], - blocks_to_swap_in=[], - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=[], - ) - worker.execute_model(execute_model_req=execute_model_req) - - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - cpu_key_cache, cpu_value_cache = cpu_cache[i] - for src, dst in blocks_to_swap_out: - assert allclose(gpu_key_cache[src], cpu_key_cache[dst]) - assert allclose(gpu_value_cache[src], cpu_value_cache[dst]) - - # Test swap in. 
- execute_model_req.blocks_to_swap_out = [] - execute_model_req.blocks_to_swap_in = [ - (19, 45), - (67, 23), - (12, 78), - (40, 99), - (1, 71), - ] - worker.execute_model(execute_model_req=execute_model_req) - - for i in range(num_layers): - gpu_key_cache, gpu_value_cache = gpu_cache[i] - cpu_key_cache, cpu_value_cache = cpu_cache[i] - for src, dst in execute_model_req.blocks_to_swap_in: - assert allclose(gpu_key_cache[dst], cpu_key_cache[src]) - assert allclose(gpu_value_cache[dst], cpu_value_cache[src]) diff --git a/tools/check_pickle_imports.py b/tools/check_pickle_imports.py index ef197d1fbace..b38687b94b33 100644 --- a/tools/check_pickle_imports.py +++ b/tools/check_pickle_imports.py @@ -37,10 +37,8 @@ 'tests/model_executor/test_guided_processors.py', 'vllm/distributed/utils.py', 'vllm/distributed/parallel_state.py', - 'vllm/engine/multiprocessing/client.py', 'vllm/distributed/device_communicators/custom_all_reduce_utils.py', 'vllm/distributed/device_communicators/shm_broadcast.py', - 'vllm/engine/multiprocessing/engine.py', 'benchmarks/kernels/graph_machete_bench.py', 'benchmarks/kernels/benchmark_lora.py', 'benchmarks/kernels/benchmark_machete.py', @@ -48,8 +46,6 @@ 'benchmarks/cutlass_benchmarks/w8a8_benchmarks.py', 'benchmarks/cutlass_benchmarks/sparse_benchmarks.py', # cloudpickle - 'vllm/worker/worker_base.py', - 'vllm/executor/mp_distributed_executor.py', 'vllm/executor/ray_distributed_executor.py', 'vllm/entrypoints/llm.py', 'tests/utils.py', @@ -59,8 +55,6 @@ 'vllm/v1/executor/multiproc_executor.py', 'vllm/transformers_utils/config.py', 'vllm/model_executor/models/registry.py', - 'vllm/engine/multiprocessing/client.py', - 'vllm/engine/multiprocessing/engine.py', ]) PICKLE_RE = re.compile(r"^\s*(import\s+(pickle|cloudpickle)(\s|$|\sas)" diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py deleted file mode 100755 index ee36fd19e012..000000000000 --- a/vllm/attention/backends/flash_attn.py +++ /dev/null @@ -1,1005 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer with FlashAttention.""" -from collections import defaultdict -from dataclasses import dataclass -from itertools import accumulate -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type - -import torch - -from vllm import _custom_ops as ops -# yapf conflicts with isort for this block -# yapf: disable -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, - AttentionMetadataBuilder, - AttentionType, - is_quantized_kv_cache) -# yapf: enable -from vllm.attention.backends.utils import ( - PAD_SLOT_ID, CommonAttentionState, compute_slot_mapping, - compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens, - get_seq_len_block_table_args, is_all_cross_attn_metadata_set, - is_all_encoder_attn_metadata_set, is_block_tables_empty) -from vllm.attention.utils.fa_utils import (flash_attn_supports_fp8, - get_flash_attn_version) -from vllm.logger import init_logger -from vllm.multimodal import MultiModalPlaceholderMap -from vllm.utils import async_tensor_h2d, make_tensor_with_pad -from vllm.vllm_flash_attn import (flash_attn_varlen_func, - flash_attn_with_kvcache) - -if TYPE_CHECKING: - from vllm.worker.model_runner import (ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata) - -logger = init_logger(__name__) - - -class FlashAttentionBackend(AttentionBackend): - - accept_output_buffer: bool = True - - 
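# Illustrative sketch (assumed toy shapes, not an exact vLLM call) of the
# paged-KV addressing this backend relies on: the cache is laid out as
# (2, num_blocks, block_size, num_kv_heads, head_size), index 0/1 selects
# the key/value cache, and a token's flat slot index is
# block_number * block_size + block_offset.
import torch

num_blocks, block_size, num_kv_heads, head_size = 8, 16, 4, 64
kv_cache = torch.zeros(2, num_blocks, block_size, num_kv_heads, head_size)

slot = 37  # this token lives in block 2 at offset 5
block_number, block_offset = divmod(slot, block_size)
key_for_token = kv_cache[0, block_number, block_offset]
value_for_token = kv_cache[1, block_number, block_offset]
assert key_for_token.shape == (num_kv_heads, head_size)
assert value_for_token.shape == (num_kv_heads, head_size)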
@staticmethod - def get_supported_head_sizes() -> List[int]: - return [32, 64, 96, 128, 160, 192, 224, 256] - - @staticmethod - def get_name() -> str: - return "FLASH_ATTN" - - @staticmethod - def get_impl_cls() -> Type["FlashAttentionImpl"]: - return FlashAttentionImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return FlashAttentionMetadata - - @staticmethod - def get_builder_cls() -> Type["FlashAttentionMetadataBuilder"]: - return FlashAttentionMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - if block_size % 16 != 0: - raise ValueError("Block size must be a multiple of 16.") - return (2, num_blocks, block_size, num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - src_key_cache = src_kv_cache[0] - dst_key_cache = dst_kv_cache[0] - ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) - src_value_cache = src_kv_cache[1] - dst_value_cache = dst_kv_cache[1] - ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - key_caches = [kv_cache[0] for kv_cache in kv_caches] - value_caches = [kv_cache[1] for kv_cache in kv_caches] - - ops.copy_blocks(key_caches, value_caches, src_to_dists) - - -@dataclass -class FlashAttentionMetadata(AttentionMetadata): - """Metadata for FlashAttentionBackend. - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - - # NOTE(sang): Definition of context_len, query_len, and seq_len. - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ---------------------| - # |-- query_len ---| - - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). - context_lens_tensor: Optional[torch.Tensor] - - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks - # in the kv cache. Each block can contain up to block_size tokens. - # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph - # captured. - block_tables: Optional[torch.Tensor] - - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - - use_cuda_graph: bool - - # Maximum query length in the batch. 
- max_query_len: Optional[int] = None - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] = None - - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] = None - - _cached_prefill_metadata: Optional["FlashAttentionMetadata"] = None - _cached_decode_metadata: Optional["FlashAttentionMetadata"] = None - - # Begin encoder attn & enc/dec cross-attn fields... - - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - encoder_seq_start_loc: Optional[torch.Tensor] = None - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - @property - def is_all_encoder_attn_metadata_set(self): - ''' - All attention metadata required for encoder attention is set. - ''' - return is_all_encoder_attn_metadata_set(self) - - @property - def is_all_cross_attn_metadata_set(self): - ''' - All attention metadata required for enc/dec cross-attention is set. - - Superset of encoder attention required metadata. - ''' - return is_all_cross_attn_metadata_set(self) - - @property - def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - return self._cached_prefill_metadata - - assert ((self.seq_lens is not None) - or (self.encoder_seq_lens is not None)) - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - query_start_loc = (None if self.query_start_loc is None else - self.query_start_loc[:self.num_prefills + 1]) - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[:self.num_prefill_tokens]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[:self.num_prefills]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[:self.num_prefills]) - seq_start_loc = (None if self.seq_start_loc is None else - self.seq_start_loc[:self.num_prefills + 1]) - context_lens_tensor = (None if self.context_lens_tensor is None else - self.context_lens_tensor[:self.num_prefills]) - block_tables = (None if self.block_tables is None else - self.block_tables[:self.num_prefills]) - - self._cached_prefill_metadata = FlashAttentionMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=self. 
- multi_modal_placeholder_index_maps, - enable_kv_scales_calculation=self.enable_kv_scales_calculation, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_query_len=0, - max_decode_seq_len=0, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=False, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - encoder_seq_start_loc=self.encoder_seq_start_loc, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_prefill_metadata - - @property - def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - return self._cached_decode_metadata - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[self.num_prefill_tokens:]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[self.num_prefills:]) - block_tables = (None if self.block_tables is None else - self.block_tables[self.num_prefills:]) - - self._cached_decode_metadata = FlashAttentionMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - seq_lens=None, - seq_lens_tensor=seq_lens_tensor, - max_decode_query_len=self.max_decode_query_len, - max_query_len=self.max_query_len, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - # Batch may be composed of prefill|decodes, adjust query start - # indices to refer to the start of decodes. E.g. - # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. - query_start_loc=(self.query_start_loc[self.num_prefills:] - - self.query_start_loc[self.num_prefills]) - if self.query_start_loc is not None else None, - seq_start_loc=self.seq_start_loc[self.num_prefills:] - if self.seq_start_loc is not None else None, - context_lens_tensor=None, - block_tables=block_tables, - use_cuda_graph=self.use_cuda_graph, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - encoder_seq_start_loc=self.encoder_seq_start_loc, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_decode_metadata - - def advance_step(self, - model_input: "ModelInputForGPUWithSamplingMetadata", - sampled_token_ids: Optional[torch.Tensor], - block_size: int, - num_seqs: int, - num_queries: int, - turn_prefills_into_decodes: bool = False): - """ - Update metadata in-place to advance one decode step. - """ - # When using cudagraph, the num_seqs is padded to the next captured - # batch sized, but num_queries tracks the actual number of requests in - # the batch. 
For --enforce-eager mode, num_seqs == num_queries - if num_seqs != num_queries: - assert num_seqs > num_queries - assert self.use_cuda_graph - - if turn_prefills_into_decodes: - # When Multi-Step is enabled with Chunked-Prefill, prefills and - # decodes are scheduled together. In the first step, all the - # prefills turn into decodes. This update reflects that - # conversion. - assert self.num_decode_tokens + self.num_prefills == num_seqs - self.num_decode_tokens += self.num_prefills - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.max_prefill_seq_len = 0 - self.max_query_len = 1 - - self.slot_mapping = self.slot_mapping[:num_seqs] - else: - assert self.seq_lens is not None - assert self.max_decode_seq_len == max(self.seq_lens) - - assert self.num_prefills == 0 - assert self.num_prefill_tokens == 0 - assert self.num_decode_tokens == num_seqs - assert self.slot_mapping.shape == (num_seqs, ) - - assert self.seq_lens is not None - assert len(self.seq_lens) == num_seqs - assert self.seq_lens_tensor is not None - assert self.seq_lens_tensor.shape == (num_seqs, ) - assert self.max_query_len == 1 - assert self.max_prefill_seq_len == 0 - - assert self.query_start_loc is not None - assert self.query_start_loc.shape == (num_queries + 1, ) - assert self.seq_start_loc is not None - assert self.seq_start_loc.shape == (num_seqs + 1, ) - - assert self.context_lens_tensor is not None - assert self.context_lens_tensor.shape == (num_queries, ) - - assert self.block_tables is not None - assert self.block_tables.shape[0] == num_seqs - - # Update query lengths. Note that we update only queries and not seqs, - # since tensors may be padded due to captured cuda graph batch size - for i in range(num_queries): - self.seq_lens[i] += 1 - self.max_decode_seq_len = max(self.seq_lens) - - ops.advance_step_flashattn(num_seqs=num_seqs, - num_queries=num_queries, - block_size=block_size, - input_tokens=model_input.input_tokens, - sampled_token_ids=sampled_token_ids, - input_positions=model_input.input_positions, - seq_lens=self.seq_lens_tensor, - slot_mapping=self.slot_mapping, - block_tables=self.block_tables) - - -class FlashAttentionMetadataBuilder( - AttentionMetadataBuilder[FlashAttentionMetadata]): - - def __init__(self, input_builder: "ModelInputForGPUBuilder"): - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - - def prepare(self): - self.slot_mapping: List[int] = [] - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.block_tables: List[List[int]] = [] - self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.num_decode_tokens = 0 - self.has_prefix_cache_hit = False - - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool, prefix_cache_hit: bool): - """Add a sequence group to the metadata. Specifically update/append - 1. context length. - 2. block table. - 3. slot mapping. 
- """ - is_prompt = inter_data.is_prompt - block_tables = inter_data.block_tables - - for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, - curr_sliding_window_block) in zip( - inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], - inter_data.orig_seq_lens, inter_data.seq_lens, - inter_data.query_lens, inter_data.context_lens, - inter_data.curr_sliding_window_blocks): - self.context_lens.append(context_len) - - if is_prompt: - mm_maps = inter_data.multi_modal_placeholder_maps - if mm_maps: - for modality, placeholders in mm_maps.items(): - self.multimodal_placeholder_maps[modality].extend( - placeholders) - - self.num_prefills += 1 - self.num_prefill_tokens += token_len - self.prefill_seq_lens.append(seq_len) - else: - self.num_decode_tokens += query_len - self.curr_seq_lens.append(curr_seq_len) - - # Compute block table. - # TODO(sang): Combine chunked prefill and prefix caching by - # only allowing multiple of block_size chunk size. - # NOTE: This only works for oooooooxxx style attention. - block_table = [] - if prefix_cache_hit: - # NOTE(woosuk): For flash-attn, the block table should - # include the entries for the incoming prefill tokens. - block_table = block_tables[seq_id] - elif ((chunked_prefill_enabled or not is_prompt) - and block_tables is not None): - if curr_sliding_window_block == 0: - block_table = block_tables[seq_id] - else: - block_table = block_tables[seq_id][ - -curr_sliding_window_block:] - self.block_tables.append(block_table) - - # Compute slot mapping. - is_profile_run = is_block_tables_empty(block_tables) - start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, - context_len, - self.sliding_window) - compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id, - seq_len, context_len, start_idx, - self.block_size, inter_data.block_tables) - - def _get_graph_runner_block_tables( - self, num_seqs: int, - block_tables: List[List[int]]) -> torch.Tensor: - # The shape of graph_block_tables is - # [max batch size, max context len // block size]. - max_batch_size, max_blocks = self.runner.graph_block_tables.shape - assert max_batch_size >= num_seqs - - graph_block_tables = self.runner.graph_block_tables[:num_seqs] - for i, block_table in enumerate(block_tables): - if block_table: - num_blocks = len(block_table) - if num_blocks <= max_blocks: - graph_block_tables[i, :num_blocks] = block_table - else: - # It may be possible to have more blocks allocated due - # to lookahead slots of multi-step, however, they are - # not used anyway, so can be safely ignored. - graph_block_tables[ - i, :max_blocks] = block_table[:max_blocks] - - return torch.from_numpy(graph_block_tables).to( - device=self.runner.device, non_blocking=True) - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int): - """Build attention metadata with on-device tensors. - - Args: - seq_lens: The maybe padded sequence lengths of the input sequences. - query_lens: The query lengths of the input sequences. - cuda_graph_pad_size: The padding size for cuda graph. - -1 if cuda graph is not used. - batch_size: The maybe padded batch size. 
- """ - prefix_cache_hit = any([ - inter_data.prefix_cache_hit - for inter_data in self.input_builder.inter_data_list - ]) - for inter_data in self.input_builder.inter_data_list: - self._add_seq_group(inter_data, - self.input_builder.chunked_prefill_enabled, - prefix_cache_hit) - - device = self.runner.device - use_captured_graph = cuda_graph_pad_size != -1 - - max_query_len = max(query_lens) - decode_query_lens = query_lens[self.num_prefills:] - if len(decode_query_lens) > 0: - max_decode_query_len = max(decode_query_lens) - else: - max_decode_query_len = 1 - max_prefill_seq_len = max(self.prefill_seq_lens, default=0) - max_decode_seq_len = max(self.curr_seq_lens, default=0) - num_decode_tokens = self.num_decode_tokens - query_start_loc = list(accumulate(query_lens, initial=0)) - seq_start_loc = list(accumulate(seq_lens, initial=0)) - - num_seqs = len(seq_lens) - if use_captured_graph: - self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) - self.block_tables.extend([] * cuda_graph_pad_size) - num_decode_tokens = batch_size - self.num_prefill_tokens - block_tables = self._get_graph_runner_block_tables( - num_seqs, self.block_tables) - else: - block_tables = make_tensor_with_pad( - self.block_tables, - pad=0, - dtype=torch.int, - device=device, - ) - assert max_query_len > 0, ("query_lens: {}".format(query_lens)) - - assert device is not None - context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int, - device, self.runner.pin_memory) - seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, - self.runner.pin_memory) - slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long, - device, self.runner.pin_memory) - query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32, - device, - self.runner.pin_memory) - seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, - device, self.runner.pin_memory) - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - self.multimodal_placeholder_maps.items() - } - - return FlashAttentionMetadata( - num_prefills=self.num_prefills, - slot_mapping=slot_mapping_tensor, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - seq_lens=seq_lens, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=True, - seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - max_decode_query_len=max_decode_query_len, - max_prefill_seq_len=max_prefill_seq_len, - max_decode_seq_len=max_decode_seq_len, - query_start_loc=query_start_loc_tensor, - seq_start_loc=seq_start_loc_tensor, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=use_captured_graph, - ) - - -class FlashAttentionImpl(AttentionImpl): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| - |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - - Otherwise, the layout is as follows: - |<----------------- num_decode_tokens ------------------>| - |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| - - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - - The prompts might have different lengths, while the generation tokens - always have length 1. - - If chunked prefill is enabled, prefill tokens and decode tokens can be - batched together in a flattened 1D query. 
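The flattened 1D query layout described here is addressed through cumulative start locations: `query_start_loc[i]` is where request `i`'s query tokens begin in the packed tensor, which is exactly what `list(accumulate(query_lens, initial=0))` produces in `build()` above. A small worked example of that convention:

from itertools import accumulate

# Two chunked prefills (4 and 6 query tokens) followed by two decodes (1 each),
# packed back-to-back into a single 1D token stream of length 12.
query_lens = [4, 6, 1, 1]
query_start_loc = list(accumulate(query_lens, initial=0))
assert query_start_loc == [0, 4, 10, 11, 12]

# Request i owns the half-open slice [query_start_loc[i], query_start_loc[i + 1]).
for i, qlen in enumerate(query_lens):
    assert query_start_loc[i + 1] - query_start_loc[i] == qlen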
- - |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->| - |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->| - - Currently, cuda graph is disabled for chunked prefill, meaning there's no - padding between prefill and decode tokens. - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0 " - "FLASH_ATTN backend.") - if use_irope: - logger.warning( - "Using irope in V0 is not supported yet, it will fall back " - "to global attention for long context.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = ((sliding_window - 1, - 0) if sliding_window is not None else (-1, -1)) - self.kv_cache_dtype = kv_cache_dtype - self.vllm_flash_attn_version = get_flash_attn_version( - requires_alibi=self.alibi_slopes is not None) - if is_quantized_kv_cache(self.kv_cache_dtype) and ( - not self.kv_cache_dtype.startswith("fp8") - or not flash_attn_supports_fp8()): - raise NotImplementedError( - f"FlashAttention does not support {self.kv_cache_dtype} " - "kv-cache on this device " - f"(FA supports fp8 = {flash_attn_supports_fp8()}).") - if logits_soft_cap is None: - # In flash-attn, setting logits_soft_cap as 0 means no soft cap. - logits_soft_cap = 0 - self.logits_soft_cap = logits_soft_cap - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - support_head_sizes = FlashAttentionBackend.get_supported_head_sizes() - if head_size not in support_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by FlashAttention. " - f"Supported head sizes are: {support_head_sizes}.") - self.attn_type = attn_type - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: FlashAttentionMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with FlashAttention. - - Args: - query: shape = [num_tokens, num_heads, head_size] - key: shape = [num_tokens, num_kv_heads, head_size] - value: shape = [num_tokens, num_kv_heads, head_size] - output: shape = [num_tokens, num_heads, head_size] - kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - NOTE: It in-place updates the output tensor. - NOTE: FP8 quantization, flash-attn expect the size of - {q,k,v}_descale to be (num_sequences, num_kv_heads). - We use torch's .expand() to avoid duplicating values - """ - assert output is not None, "Output tensor must be provided." - - if output_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for FlashAttentionImpl") - - # NOTE(woosuk): FlashAttention2 does not support FP8 KV cache. 
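The forward docstring above notes that for FP8 attention the per-layer q/k/v scales are broadcast to shape `(num_sequences, num_kv_heads)` with `torch.Tensor.expand()` rather than materialized. A minimal sketch of that broadcast; the tensor names are illustrative, not the layer attributes themselves:

import torch

num_seqs, num_kv_heads = 4, 8

# A single per-layer scale, conceptually like the scalar scale kept on the layer.
k_scale = torch.tensor([0.0625])

# expand() yields a (num_seqs, num_kv_heads) view without allocating
# num_seqs * num_kv_heads copies of the value.
k_descale = k_scale.expand(num_seqs, num_kv_heads)

assert k_descale.shape == (num_seqs, num_kv_heads)
assert k_descale.data_ptr() == k_scale.data_ptr()  # same storage, no copy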
- if not flash_attn_supports_fp8() or output.dtype != torch.bfloat16: - assert ( - layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0), ( - "key/v_scale is only supported in FlashAttention 3 with " - "base dtype bfloat16") - - attn_type = self.attn_type - if (attn_type == AttentionType.ENCODER - and (not attn_metadata.is_all_encoder_attn_metadata_set)): - raise AttributeError("Encoder attention requires setting " - "encoder metadata attributes.") - elif (attn_type == AttentionType.ENCODER_DECODER - and (not attn_metadata.is_all_cross_attn_metadata_set)): - raise AttributeError("Encoder/decoder cross-attention " - "requires setting cross-attention " - "metadata attributes.") - - kv_cache_dtype: str = self.kv_cache_dtype - softmax_scale: float = self.scale - window_size = self.sliding_window - alibi_slopes: Optional[torch.Tensor] = self.alibi_slopes - logits_soft_cap: Optional[float] = self.logits_soft_cap - fp8_attention = kv_cache_dtype.startswith("fp8") - - if fp8_attention and not flash_attn_supports_fp8(): - raise NotImplementedError( - "FlashAttention does not support FP8 kv-cache on this device.") - - if kv_cache.numel() > 0: - key_cache = kv_cache[0] - value_cache = kv_cache[1] - # We skip updating the KV cache under two conditions: - # a. When the Attention Type is ENCODER. In this phase, we compute - # only the encoder attention without updating the cache. - # b. When both Key and Value are None. This occurs during - # cross-attention computation in the decoding phase, where the - # KV cache is already populated with the cross-attention - # tensor. Thus, we skip cache updates during this time. - if (attn_type != AttentionType.ENCODER) and (key is not None) and ( - value is not None): - if attn_type == AttentionType.ENCODER_DECODER: - # Update cross-attention KV cache (prefill-only) - updated_slot_mapping = attn_metadata.cross_slot_mapping - else: - # Update self-attention KV cache (prefill/decode) - updated_slot_mapping = attn_metadata.slot_mapping - - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory - # profiling run. - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - kv_cache[0], - kv_cache[1], - updated_slot_mapping.flatten(), # type: ignore[union-attr] - kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - - if fp8_attention: - kv_cache = kv_cache.view(torch.float8_e4m3fn) - key_cache = key_cache.view(torch.float8_e4m3fn) - value_cache = value_cache.view(torch.float8_e4m3fn) - - if fp8_attention: - num_tokens, num_heads, head_size = query.shape - query, _ = ops.scaled_fp8_quant( - query.reshape( - (num_tokens, num_heads * head_size)).contiguous(), - layer._q_scale) - query = query.reshape((num_tokens, num_heads, head_size)) - - (num_prefill_query_tokens, num_prefill_kv_tokens, - num_decode_query_tokens) = \ - get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) - decode_query = query[num_prefill_query_tokens:] - decode_output = output[num_prefill_query_tokens:] - # QKV for prefill. - query = query[:num_prefill_query_tokens] - prefill_output = output[:num_prefill_query_tokens] - assert query.shape[0] == num_prefill_query_tokens - assert decode_query.shape[0] == num_decode_query_tokens - - if prefill_meta := attn_metadata.prefill_metadata: - # Prompt run. 
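The `reshape_and_cache_flash` call above scatters the new key/value rows into the paged cache at the slots given by `slot_mapping`. Ignoring the dtype conversion and k/v scales that the real kernel also handles, a pure-PyTorch approximation of that scatter under this backend's `[2, num_blocks, block_size, num_kv_heads, head_size]` cache layout:

import torch

num_blocks, block_size, num_kv_heads, head_size = 8, 16, 2, 64
num_tokens = 5

kv_cache = torch.zeros(2, num_blocks, block_size, num_kv_heads, head_size)
key = torch.randn(num_tokens, num_kv_heads, head_size)
value = torch.randn(num_tokens, num_kv_heads, head_size)
# One physical slot per new token: block_number * block_size + in-block offset.
slot_mapping = torch.tensor([112, 113, 114, 49, 50])

# View each half of the cache as a flat [num_blocks * block_size, ...] table
# and write the new rows in at their slot indices.
key_cache = kv_cache[0].view(num_blocks * block_size, num_kv_heads, head_size)
value_cache = kv_cache[1].view(num_blocks * block_size, num_kv_heads, head_size)
key_cache[slot_mapping] = key
value_cache[slot_mapping] = value

assert torch.equal(kv_cache[0, 7, 0], key[0])  # slot 112 == block 7, offset 0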
- if (kv_cache.numel() == 0 or prefill_meta.block_tables is None - or prefill_meta.block_tables.numel() == 0): - # normal attention - # When block_tables are not filled, it means q and k are the - # prompt, and they have the same length. - q_seq_start_loc, q_seq_len, k_seq_start_loc, k_seq_len = \ - _get_query_key_seq_metadata(prefill_meta, True, attn_type) - - key = key[:num_prefill_kv_tokens] - value = value[:num_prefill_kv_tokens] - - if fp8_attention: - num_kv_tokens, num_kv_heads, head_size = key.shape - - key, _ = ops.scaled_fp8_quant( - key.reshape((num_kv_tokens, - num_kv_heads * head_size)).contiguous(), - layer._k_scale) - key = key.reshape((num_kv_tokens, num_kv_heads, head_size)) - - value, _ = ops.scaled_fp8_quant( - value.reshape((num_kv_tokens, - num_kv_heads * head_size)).contiguous(), - layer._v_scale) - value = value.reshape( - (num_kv_tokens, num_kv_heads, head_size)) - - descale_shape = (q_seq_start_loc.shape[0] - 1, key.shape[1]) - flash_attn_varlen_func( - q=query, - k=key, - v=value, - cu_seqlens_q=q_seq_start_loc, - cu_seqlens_k=k_seq_start_loc, - max_seqlen_q=q_seq_len, - max_seqlen_k=k_seq_len, - softmax_scale=softmax_scale, - causal=_get_causal_option(attn_type), - window_size=window_size, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - out=prefill_output, - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - else: - # prefix-enabled attention - assert attn_type == AttentionType.DECODER, ( - "Only decoder-only models support prefix caching") - assert prefill_meta.seq_lens is not None - assert prefill_meta.query_start_loc is not None - max_seq_len = max(prefill_meta.seq_lens) - descale_shape = (prefill_meta.query_start_loc.shape[0] - 1, - key.shape[1]) - flash_attn_varlen_func( # noqa - q=query, - k=key_cache, - v=value_cache, - cu_seqlens_q=prefill_meta.query_start_loc, - max_seqlen_q=prefill_meta.max_query_len, - seqused_k=prefill_meta.seq_lens_tensor, - max_seqlen_k=max_seq_len, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, - block_table=prefill_meta.block_tables, - softcap=logits_soft_cap, - out=prefill_output, - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - - if decode_meta := attn_metadata.decode_metadata: - # Decoding run. - # Use flash_attn_varlen_func kernel for speculative decoding - # because different queries might have different lengths. 
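The comment above concerns the decode path under speculative decoding: each decode request may carry several query tokens, and different requests may carry different numbers of them, so the decode batch itself is variable-length and goes through `flash_attn_varlen_func` rather than the single-token `flash_attn_with_kvcache` path. A small illustration of what that metadata looks like; the numbers are made up:

from itertools import accumulate

# Three decode requests carrying 3, 1 and 2 query tokens respectively,
# so max_decode_query_len == 3 and the varlen decode branch is taken.
decode_query_lens = [3, 1, 2]
decode_seq_lens = [70, 512, 9]  # KV lengths can differ per request

cu_seqlens_q = list(accumulate(decode_query_lens, initial=0))
assert cu_seqlens_q == [0, 3, 4, 6]
assert max(decode_query_lens) > 1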
- - assert decode_meta.max_decode_query_len is not None - # use only for actual varlen decoding - if decode_meta.max_decode_query_len > 1: - assert attn_type == AttentionType.DECODER, ( - "Only decoder-only models support max_decode_query_len > 1" - ) - assert decode_meta.query_start_loc is not None - descale_shape = (decode_meta.query_start_loc.shape[0] - 1, - key.shape[1]) - flash_attn_varlen_func( - q=decode_query, - k=key_cache, - v=value_cache, - cu_seqlens_q=decode_meta.query_start_loc, - max_seqlen_q=decode_meta.max_decode_query_len, - seqused_k=decode_meta.seq_lens_tensor, - max_seqlen_k=decode_meta.max_decode_seq_len, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - block_table=decode_meta.block_tables, - out=decode_output, - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - else: - # Use flash_attn_with_kvcache for normal decoding. - ( - seq_lens_arg, - _, - block_tables_arg, - ) = get_seq_len_block_table_args(decode_meta, False, attn_type) - descale_shape = (seq_lens_arg.shape[0], key_cache.shape[-2]) - flash_attn_with_kvcache( - q=decode_query.unsqueeze(1), - k_cache=key_cache, - v_cache=value_cache, - block_table=block_tables_arg, - cache_seqlens=seq_lens_arg, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - out=decode_output.unsqueeze(1), - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - return output - - -def _get_query_key_seq_metadata( - attn_metadata, - is_prompt: bool, - attn_type: str, -) -> tuple: - """ - Returns sequence metadata for key and query based on the specified - attention type and whether input is a prompt. - - This function computes the starting locations and maximum sequence lengths - for key and query sequences for different attention types. - - Args: - attn_metadata: The attention metadata object - is_prompt (bool): A flag indicating if the input is a prompt - attn_type (AttentionType): The type of attention being used. - - Returns: - tuple: A tuple containing four integers: - - Starting location for the query sequence. - - Maximum sequence length for the query sequence. - - Starting location for the key sequence. - - Maximum sequence length for the key sequence. - - Raises: - AttributeError: If an invalid attention type is provided. - """ - if attn_type == AttentionType.DECODER: - # Decoder self-attention - # Choose max_seq_len based on whether we are in prompt_run - if is_prompt: - max_seq_len = attn_metadata.max_prefill_seq_len - else: - max_seq_len = attn_metadata.max_decode_seq_len - return (attn_metadata.seq_start_loc, max_seq_len, - attn_metadata.seq_start_loc, max_seq_len) - - elif attn_type == AttentionType.ENCODER_DECODER: - # This is cross attention between the where the key - # is the precomputed encoder attention and query - # is the input sequence. - # Choose query max length based on whether it is prompt - # or not. 
- if is_prompt: - max_seq_len = attn_metadata.max_prefill_seq_len - else: - max_seq_len = attn_metadata.max_decode_seq_len - return (attn_metadata.seq_start_loc, max_seq_len, - attn_metadata.encoder_seq_start_loc, - attn_metadata.max_encoder_seq_len) - elif attn_type == AttentionType.ENCODER: - # For encoder attention both the query and the key are same i.e the - # encoder sequence. - return (attn_metadata.encoder_seq_start_loc, - attn_metadata.max_encoder_seq_len, - attn_metadata.encoder_seq_start_loc, - attn_metadata.max_encoder_seq_len) - elif attn_type == AttentionType.ENCODER_ONLY: - assert is_prompt, "Should not have decode for encoder only model." - return (attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len, - attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len) - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - -def _get_causal_option(attn_type: str) -> bool: - """ - Determine whether the given attention type is suitable for causal - attention mechanisms. - - Args: - attn_type (AttentionType): The type of attention being evaluated - - Returns: - bool: Returns `True` if the attention type is suitable for causal - attention (i.e., not encoder, encoder-only, or encoder-decoder), - otherwise returns `False`. - """ - return not (attn_type == AttentionType.ENCODER - or attn_type == AttentionType.ENCODER_ONLY - or attn_type == AttentionType.ENCODER_DECODER) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py deleted file mode 100644 index e6e60e756248..000000000000 --- a/vllm/attention/backends/flashinfer.py +++ /dev/null @@ -1,1194 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from collections import defaultdict -from contextlib import contextmanager -from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type - -from vllm.multimodal import MultiModalPlaceholderMap - -try: - from flashinfer import BatchDecodeWithPagedKVCacheWrapper - from flashinfer.decode import (CUDAGraphBatchDecodeWithPagedKVCacheWrapper, - trtllm_batch_decode_with_kv_cache) - from flashinfer.prefill import BatchPrefillWithPagedKVCacheWrapper - - from vllm.vllm_flash_attn import flash_attn_varlen_func - FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024 -except ImportError: - # Avoid turning these types into variables during type checking - if not TYPE_CHECKING: - BatchDecodeWithPagedKVCacheWrapper = None - CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None - BatchPrefillWithPagedKVCacheWrapper = None - trtllm_batch_decode_with_kv_cache = None - FLASHINFER_WORKSPACE_BUFFER_SIZE = 0 - raise ImportError("FlashInfer is not installed. 
Please install it from " - "https://github.com/flashinfer-ai/flashinfer") from None - -import torch - -import vllm.envs as envs -from vllm import _custom_ops as ops -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, - AttentionMetadataBuilder, - AttentionState, AttentionType) -from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, - compute_slot_mapping_start_idx, - is_block_tables_empty) -from vllm.attention.layer import Attention -from vllm.attention.ops.paged_attn import PagedAttention -from vllm.config import VllmConfig, get_layers_from_vllm_config -from vllm.logger import init_logger -from vllm.platforms import current_platform -from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, - make_tensor_with_pad) - -logger = init_logger(__name__) - -if TYPE_CHECKING: - from vllm.worker.model_runner import (ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata) - - -class FlashInferBackend(AttentionBackend): - cached_sm100a_supported: Optional[bool] = None - - @staticmethod - def get_name() -> str: - return "FLASHINFER" - - @staticmethod - def get_impl_cls() -> Type["FlashInferImpl"]: - return FlashInferImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return FlashInferMetadata - - @staticmethod - def get_builder_cls() -> Type["FlashInferMetadataBuilder"]: - return FlashInferMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["FlashInferState"]: - return FlashInferState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return (num_blocks, 2, block_size, num_kv_heads, head_size) - - @staticmethod - def get_kv_cache_stride_order() -> Tuple[int, ...]: - cache_layout = FlashInferState.get_kv_cache_layout() - assert (cache_layout in ("NHD", "HND")) - stride_order = (0, 1, 2, 3, 4) if cache_layout == "NHD" else (0, 1, 3, - 2, 4) - return stride_order - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: torch.Tensor, - ) -> None: - PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - PagedAttention.copy_blocks(kv_caches, src_to_dists) - - @staticmethod - def get_supported_head_sizes() -> List[int]: - return [64, 128, 256] - - @staticmethod - def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype: - if kv_cache_dtype in ("fp8", "fp8_e4m3"): - return torch.float8_e4m3fn - elif kv_cache_dtype == "fp8_e5m2": - return torch.float8_e5m2 - else: - raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") - - @staticmethod - def use_trtllm_decode_attention( - batch_size: int, - max_seq_len: int, - kv_cache_dtype: str, - num_qo_heads: Optional[int], - num_kv_heads: Optional[int], - attn_head_size: Optional[int], - ) -> bool: - if FlashInferBackend.cached_sm100a_supported is None: - FlashInferBackend.cached_sm100a_supported = ( - current_platform.has_device_capability(100)) - if not FlashInferBackend.cached_sm100a_supported: - return False - # Check if the dimensions are supported by TRTLLM decode attention - if (attn_head_size is None or num_qo_heads is None - or num_kv_heads is None or num_qo_heads // num_kv_heads > 8 - or num_qo_heads % num_kv_heads != 0 or attn_head_size != 128): - return False - env_value = 
envs.VLLM_USE_TRTLLM_DECODE_ATTENTION - if env_value is not None: - logger.info_once("VLLM_USE_TRTLLM_DECODE_ATTENTION is set to %s", - env_value) - # Environment variable is set - respect it - # Making the conditional check for zero because - # the path is automatically enabled if the batch size condition - # is satisfied. - no_use_trtllm = (env_value == "0") - if not no_use_trtllm: - logger.info_once("Using TRTLLM decode attention.") - return not no_use_trtllm - else: - # Environment variable not set - use auto-detection - use_trtllm = (FlashInferBackend.cached_sm100a_supported - and batch_size <= 256 and max_seq_len < 131072 - and kv_cache_dtype == "auto") - if use_trtllm: - logger.warning_once( - "Using TRTLLM decode attention (auto-detected).") - return use_trtllm - - -@dataclass -class PerLayerParameters: - """ - Currently, FlashInfer backend only support models in which all layers share - the same values for the following hyperparameters. - """ - - window_left: int - logits_soft_cap: Optional[float] - sm_scale: float - - -def get_per_layer_parameters( - vllm_config: VllmConfig) -> Dict[str, PerLayerParameters]: - """ - Scan all attention layers and determine some hyperparameters - to use during `plan`. - """ - - layers = get_layers_from_vllm_config(vllm_config, Attention) - per_layer_params: Dict[str, PerLayerParameters] = {} - - for key, layer in layers.items(): - impl = layer.impl - assert isinstance(impl, FlashInferImpl) - - # Infer hyperparameters from the attention layer - window_size = impl.sliding_window - window_left = window_size[0] if window_size is not None else -1 - logits_soft_cap = impl.logits_soft_cap - sm_scale = impl.scale - - per_layer_params[key] = PerLayerParameters(window_left, - logits_soft_cap, sm_scale) - - return per_layer_params - - -def infer_global_hyperparameters( - per_layer_params: Dict[str, PerLayerParameters]) -> PerLayerParameters: - """ - Currently, FlashInfer backend only support models in which all layers share - the same values for the following hyperparameters: - - `window_left` - - `logits_soft_cap` - - `sm_scale` - - So this function asserts that all layers share the same values for these - hyperparameters and returns the global values. - """ - - assert len(per_layer_params) > 0, "No attention layers found in the model." 
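The three hyperparameters collected into `PerLayerParameters` have simple meanings: `window_left = -1` disables the sliding window, `sm_scale` is the softmax scale (conventionally `1 / sqrt(head_dim)` when unset), and a positive `logits_soft_cap` squashes attention logits through a scaled tanh, as also spelled out in the `FlashInferMetadata` comments below. A sketch restating those definitions; the helpers are illustrative only, since FlashInfer applies both values inside its kernels:

import math
import torch

def default_sm_scale(head_dim: int) -> float:
    """Softmax scale conventionally used when none is supplied."""
    return 1.0 / math.sqrt(head_dim)

def apply_logits_soft_cap(logits: torch.Tensor, cap: float) -> torch.Tensor:
    """logits_soft_cap * tanh(x / logits_soft_cap); a non-positive cap means no capping."""
    if cap <= 0:
        return logits
    return cap * torch.tanh(logits / cap)

assert default_sm_scale(128) == 1.0 / math.sqrt(128)
capped = apply_logits_soft_cap(torch.tensor([0.0, 1e6]), cap=30.0)
assert float(capped[1]) <= 30.0  # large logits saturate at the cap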
- - param_sets = list(per_layer_params.values()) - global_params = param_sets[0] - for params in param_sets: - assert params == global_params, ( - "FlashInfer backend currently only supports models in which all " - "layers share the same values for the following hyperparameters: " - "`window_left`, `logits_soft_cap`, `sm_scale`.") - - return global_params - - -class FlashInferState(AttentionState): - - def __init__(self, runner): - self.runner = runner - self._is_graph_capturing = False - self._workspace_buffer = None - self._decode_wrapper = None - self._prefill_wrapper = None - - # Global hyperparameters shared by all attention layers - self.global_hyperparameters: Optional[PerLayerParameters] = None - - self.vllm_config = self.runner.vllm_config - self._kv_cache_layout = None - - def _get_workspace_buffer(self): - if self._workspace_buffer is None: - self._workspace_buffer = torch.empty( - FLASHINFER_WORKSPACE_BUFFER_SIZE, - dtype=torch.uint8, - device=self.runner.device) - return self._workspace_buffer - - @staticmethod - def get_kv_cache_layout(): - from vllm.v1.attention.backends.utils import _KV_CACHE_LAYOUT_OVERRIDE - if _KV_CACHE_LAYOUT_OVERRIDE is not None: - logger.info_once("Using KV cache layout %s", - _KV_CACHE_LAYOUT_OVERRIDE) - return _KV_CACHE_LAYOUT_OVERRIDE - cache_layout = envs.VLLM_KV_CACHE_LAYOUT - if cache_layout is None: - logger.info_once("Using default KV cache layout NHD") - return "NHD" - logger.info_once("Using KV cache layout %s", cache_layout) - return cache_layout - - def _get_prefill_wrapper(self): - if self._prefill_wrapper is None: - self._prefill_wrapper = BatchPrefillWithPagedKVCacheWrapper( - self._get_workspace_buffer(), self.get_kv_cache_layout()) - return self._prefill_wrapper - - def _get_decode_wrapper(self): - if self._decode_wrapper is None: - num_qo_heads = (self.runner.model_config.get_num_attention_heads( - self.runner.parallel_config)) - num_kv_heads = self.runner.model_config.get_num_kv_heads( - self.runner.parallel_config) - use_tensor_cores = envs.VLLM_FLASHINFER_FORCE_TENSOR_CORES or ( - num_qo_heads // num_kv_heads > 4) - self._decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( - self._get_workspace_buffer(), - self.get_kv_cache_layout(), - use_tensor_cores=use_tensor_cores) - return self._decode_wrapper - - @contextmanager - def graph_capture(self, max_batch_size: int): - self._is_graph_capturing = True - self._graph_decode_wrapper = None - self._graph_slot_mapping = torch.full((max_batch_size, ), - PAD_SLOT_ID, - dtype=torch.long, - device=self.runner.device) - self._graph_seq_lens = torch.ones(max_batch_size, - dtype=torch.int32, - device=self.runner.device) - self._graph_block_tables = torch.from_numpy( - self.runner.graph_block_tables).to(device=self.runner.device) - self._graph_decode_workspace_buffer = self._get_workspace_buffer() - self._graph_indices_buffer = torch.empty( - max_batch_size * self.runner.cache_config.num_gpu_blocks, - dtype=torch.int32, - device=self.runner.device) - self._graph_indptr_buffer = torch.empty(max_batch_size + 1, - dtype=torch.int32, - device=self.runner.device) - self._graph_last_page_len_buffer = torch.empty( - max_batch_size, dtype=torch.int32, device=self.runner.device) - yield - self._is_graph_capturing = False - del self._graph_slot_mapping - del self._graph_seq_lens - del self._graph_block_tables - del self._graph_decode_workspace_buffer - del self._graph_indices_buffer - del self._graph_indptr_buffer - del self._graph_last_page_len_buffer - del self._graph_decode_wrapper - - def 
graph_clone(self, batch_size: int): - assert self._is_graph_capturing - state = self.__class__(self.runner) - state._workspace_buffer = self._graph_decode_workspace_buffer - state._decode_wrapper = self._graph_decode_wrapper - state._prefill_wrapper = self._get_prefill_wrapper() - return state - - def graph_capture_get_metadata_for_batch( - self, batch_size: int, is_encoder_decoder_model: bool = False): - assert self._is_graph_capturing - _indptr_buffer = self._graph_indptr_buffer[:batch_size + 1] - _last_page_len_buffer = self._graph_last_page_len_buffer[:batch_size] - - num_qo_heads = (self.runner.model_config.get_num_attention_heads( - self.runner.parallel_config)) - num_kv_heads = self.runner.model_config.get_num_kv_heads( - self.runner.parallel_config) - use_tensor_cores = envs.VLLM_FLASHINFER_FORCE_TENSOR_CORES or ( - num_qo_heads // num_kv_heads > 4) - self._graph_decode_wrapper = \ - CUDAGraphBatchDecodeWithPagedKVCacheWrapper( - self._graph_decode_workspace_buffer, _indptr_buffer, - self._graph_indices_buffer, _last_page_len_buffer, - self.get_kv_cache_layout(), - use_tensor_cores) - if self.runner.kv_cache_dtype.startswith("fp8"): - kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer( - self.runner.kv_cache_dtype) - else: - kv_cache_dtype = get_kv_cache_torch_dtype( - self.runner.kv_cache_dtype, self.runner.model_config.dtype) - - paged_kv_indptr_tensor_host = torch.arange(0, - batch_size + 1, - dtype=torch.int32) - paged_kv_indices_tensor_host = torch.arange(0, - batch_size, - dtype=torch.int32) - paged_kv_last_page_len_tensor_host = torch.full((batch_size, ), - self.runner.block_size, - dtype=torch.int32) - query_start_loc_host = torch.arange(0, - batch_size + 1, - dtype=torch.int32) - - global_params = infer_global_hyperparameters( - get_per_layer_parameters(self.vllm_config)) - - attn_metadata = self.runner.attn_backend.make_metadata( - num_prefills=0, - slot_mapping=self._graph_slot_mapping[:batch_size], - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - num_prefill_tokens=0, - num_decode_tokens=batch_size, - max_prefill_seq_len=0, - max_decode_seq_len=0, - seq_lens_tensor=self._graph_seq_lens, - block_tables=self._graph_block_tables, - paged_kv_indptr=paged_kv_indptr_tensor_host, - paged_kv_indices=paged_kv_indices_tensor_host, - paged_kv_last_page_len=paged_kv_last_page_len_tensor_host, - num_qo_heads=num_qo_heads, - num_kv_heads=num_kv_heads, - head_dim=self.runner.model_config.get_head_size(), - page_size=self.runner.block_size, - seq_start_loc=None, - query_start_loc=query_start_loc_host, - device=self.runner.device, - data_type=kv_cache_dtype, - q_data_type=self.runner.model_config.dtype, - use_cuda_graph=True, - decode_wrapper=self._graph_decode_wrapper, - prefill_wrapper=None, - **dataclasses.asdict(global_params), - ) - attn_metadata.begin_forward() - return attn_metadata - - def get_graph_input_buffers(self, - attn_metadata, - is_encoder_decoder_model: bool = False): - return { - "block_tables": attn_metadata.block_tables, - "seq_lens_tensor": attn_metadata.seq_lens_tensor, - "slot_mapping": attn_metadata.slot_mapping, - } - - def prepare_graph_input_buffers(self, - input_buffers, - attn_metadata, - is_encoder_decoder_model: bool = False): - # FlashInfer-specific logic: copy additional tensors - num_total_blocks = attn_metadata.decode_metadata.seq_lens_tensor.shape[ - 0] - input_buffers["seq_lens_tensor"][:num_total_blocks].copy_( - attn_metadata.seq_lens_tensor, non_blocking=True) - 
input_buffers["block_tables"][:num_total_blocks].copy_( - attn_metadata.block_tables, non_blocking=True) - - def begin_forward(self, model_input): - assert not self._is_graph_capturing - state = self - use_cuda_graph = model_input.attn_metadata.use_cuda_graph - is_decode = model_input.attn_metadata.num_prefills == 0 - # In case of multistep chunked-prefill, there might be prefill requests - # scheduled while CUDA graph mode is enabled. We don't run graph in that - # case. - if use_cuda_graph and is_decode: - if model_input.inputs_embeds is None: - batch_size = model_input.input_tokens.shape[0] - state = ( - self.runner.graph_runners[model_input.virtual_engine][( - batch_size, False)].attn_state) - else: - batch_size = model_input.inputs_embeds.shape[0] - state = ( - self.runner.graph_runners[model_input.virtual_engine][( - batch_size, True)].attn_state) - - model_input.attn_metadata.prefill_wrapper = state._get_prefill_wrapper( - ) - model_input.attn_metadata.decode_wrapper = state._get_decode_wrapper() - model_input.attn_metadata.begin_forward() - - -@dataclass -class FlashInferMetadata(AttentionMetadata): - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - max_decode_seq_len: int - - # Number of query tokens for each request in the batch. - # Currently, we require that all requests have the same number of query - # tokens during the decoding phase. When speculavie decoding is enabled, - # decode_query_len might be greater than 1. In all other cases, it is 1. - decode_query_len: Optional[int] = 1 - - use_cuda_graph: bool = True - - prefill_wrapper: Optional[BatchPrefillWithPagedKVCacheWrapper] = None - decode_wrapper: Optional[BatchDecodeWithPagedKVCacheWrapper] = None - - # Metadata for the prefill stage - seq_start_loc: Optional[torch.Tensor] = None - query_start_loc: Optional[torch.Tensor] = None - block_tables: Optional[torch.Tensor] = None - - # used for GPU in-place advance_step - seq_lens_tensor: Optional[torch.Tensor] = None - block_table_bound: Optional[torch.Tensor] = None - - # An example for paged_kv_indices, paged_kv_indptr: - # request 1, page indices [0, 5, 8] - # request 2, page indices [1, 6, 7] - # request 3, page indices [3, 4] - # paged_kv_indices is a concatenation of page indices of all requests: - # [0, 5, 8, 1, 6, 7, 3, 4] - # paged_kv_indptr is used to index into paged_kv_indices: - # [0, 3, 6, 8] - # The indptr of the paged kv cache, shape: [batch_size + 1] - paged_kv_indptr: Optional[torch.Tensor] = None - # The page indices of the paged kv cache - paged_kv_indices: Optional[torch.Tensor] = None - # The number of entries in the last page of each request in - # the paged kv cache, shape: [batch_size] - paged_kv_last_page_len: Optional[torch.Tensor] = None - # The number of query/output heads - num_qo_heads: Optional[int] = None - # The number of key/value heads - num_kv_heads: Optional[int] = None - # The dimension of the attention heads - head_dim: Optional[int] = None - # Block size of vllm - page_size: Optional[int] = None - # The data type of the paged kv cache - data_type: torch.dtype = None - # The data type of the query - q_data_type: torch.dtype = None - # FlashInfer 0.2 encourages passing host tensors - device: torch.device = torch.device("cpu") - is_profile_run: bool = False - - # The FlashInfer backend currently supports only models in which all layers - # share the same following hyperparameters: - - # The left (inclusive) window size for the attention window, when - # set to `-1`, 
the window size will be set to the full length of - # the sequence. Defaults to `-1`. - window_left: int = -1 - # The attention logits soft capping value (used in Gemini, Grok and - # Gemma-2, etc.), if not provided, will be set to `0`. If greater - # than 0, the logits will be capped according to formula: - # $$\texttt{logits\_soft\_cap} \times - # \mathrm{tanh}(x / \texttt{logits\_soft\_cap})$$, - # where $x$ is the input logits. - logits_soft_cap: Optional[float] = None - # The scale used in softmax, if not provided, will be set to - # `1.0 / sqrt(head_dim)`. - sm_scale: Optional[float] = None - - def __post_init__(self): - # Refer to - # https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157 - supported_head_sizes = FlashInferBackend.get_supported_head_sizes() - if self.head_dim is not None and self.head_dim \ - not in supported_head_sizes: - raise ValueError( - f"Only {supported_head_sizes} are supported for head_dim,", - f" received {self.head_dim}.") - - def begin_forward(self): - if self.num_prefill_tokens > 0: - if self.paged_kv_indices is None: - return - - assert self.prefill_wrapper is not None - assert self.query_start_loc is not None - assert self.paged_kv_indices is not None - assert self.paged_kv_indptr is not None - assert self.paged_kv_last_page_len is not None - assert self.block_table_bound is not None - assert self.seq_lens_tensor is not None - self.query_start_loc = self.query_start_loc[:self.num_prefills + 1] - batch_size = self.query_start_loc.shape[0] - 1 - assert batch_size >= 0 - # We will use flash attention for profiling to - # determine the number of blocks. Therefore, - # we don't need to prepare the input for flashinfer for profile run. - if not self.is_profile_run: - self.paged_kv_indptr = self.paged_kv_indptr.to(self.device) - self.paged_kv_last_page_len = self.paged_kv_last_page_len.to( - self.device) - self.block_table_bound = self.block_table_bound.to(self.device) - self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) - self.paged_kv_indices = self.paged_kv_indices.to(self.device) - self.prefill_wrapper.plan( - self.query_start_loc, - self.paged_kv_indptr[:self.num_prefills + 1], - self.paged_kv_indices, - self.paged_kv_last_page_len[:self.num_prefills], - self.num_qo_heads, - self.num_kv_heads, - self.head_dim, - self.page_size, - causal=True, - sm_scale=self.sm_scale, - window_left=self.window_left, - logits_soft_cap=self.logits_soft_cap, - q_data_type=self.q_data_type, - kv_data_type=self.data_type) - if self.num_decode_tokens > 0: - assert self.paged_kv_indices is not None - assert self.paged_kv_indptr is not None - assert self.paged_kv_last_page_len is not None - self.paged_kv_indices = self.paged_kv_indices.to(self.device) - self.paged_kv_indptr = self.paged_kv_indptr.to(self.device) - self.paged_kv_last_page_len = self.paged_kv_last_page_len.to( - self.device) - # handle model warmup path - if self.block_table_bound is not None: - self.block_table_bound = self.block_table_bound.to(self.device) - if self.seq_lens_tensor is not None: - self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) - - assert self.decode_wrapper is not None - self.decode_wrapper.plan( - self.paged_kv_indptr[self.num_prefills:], - self.paged_kv_indices, - self.paged_kv_last_page_len[self.num_prefills:], - self.num_qo_heads, - self.num_kv_heads, - self.head_dim, - self.page_size, - # Disable flashinfer's pos encoding and use vllm's rope. 
- pos_encoding_mode="NONE", - window_left=self.window_left, - logits_soft_cap=self.logits_soft_cap, - sm_scale=self.sm_scale, - # kv-cache data type. - kv_data_type=self.data_type, - # query data type. - q_data_type=self.q_data_type) - - def asdict_zerocopy(self, - skip_fields: Optional[Set[str]] = None - ) -> Dict[str, Any]: - if skip_fields is None: - skip_fields = set() - # We need to skip the prefill/decode_wrapper field since it cannot be - # broadcasted with nccl when TP is enabled. - skip_fields.add('prefill_wrapper') - skip_fields.add('decode_wrapper') - return super().asdict_zerocopy(skip_fields) - - @property - def prefill_metadata(self) -> Optional["FlashInferMetadata"]: - if self.num_prefills == 0: - return None - return self - - @property - def decode_metadata(self) -> Optional["FlashInferMetadata"]: - if self.num_decode_tokens == 0: - return None - return self - - def advance_step(self, - model_input: "ModelInputForGPUWithSamplingMetadata", - sampled_token_ids: Optional[torch.Tensor], - block_size: int, - num_seqs: int, - num_queries: int, - turn_prefills_into_decodes: bool = False): - """ - Update metadata in-place to advance one decode step. - """ - - if turn_prefills_into_decodes: - # When Multi-Step is enabled with Chunked-Prefill, prefills and - # decodes are scheduled together. In the first step, all the - # prefills turn into decodes. This update reflects that - # conversion. - assert self.num_decode_tokens + self.num_prefills == num_seqs - # Flashinfer doesn't support speculative decoding + chunked-prefill - # + multi-step scheduling yet. - assert self.decode_query_len == 1 - self.num_decode_tokens += self.num_prefills - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.max_prefill_seq_len = 0 - self.max_query_len = 1 - - self.slot_mapping = self.slot_mapping[:num_seqs] - else: - assert self.seq_lens_tensor is not None - - assert num_seqs > 0 - assert num_queries > 0 - assert model_input.attn_metadata is not None - assert sampled_token_ids is not None - - # When using cudagraph, the num_seqs is padded to the next captured - # batch sized, but num_queries tracks the actual number of requests in - # the batch. 
For --enforce-eager mode, num_seqs == num_queries - if num_seqs != num_queries: - assert num_seqs > num_queries - assert self.use_cuda_graph - - model_input.input_tokens[:num_queries] = sampled_token_ids.flatten() - - # Update GPU tensors - ops.advance_step_flashinfer( - num_seqs=num_seqs, - num_queries=num_queries, - block_size=block_size, - input_tokens=model_input.input_tokens, - sampled_token_ids=model_input.input_tokens, - input_positions=model_input.input_positions, - seq_lens=self.seq_lens_tensor, - slot_mapping=self.slot_mapping, - block_tables=self.block_tables, - paged_kv_indices=self.paged_kv_indices, - paged_kv_indptr=self.paged_kv_indptr, - paged_kv_last_page_len=self.paged_kv_last_page_len, - block_table_bound=self.block_table_bound) - - -class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): - - def __init__(self, input_builder: "ModelInputForGPUBuilder"): - - self.input_builder = input_builder - self.runner = input_builder.runner - - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - - # Global hyperparameters shared by all attention layers - self.global_hyperparameters: Optional[PerLayerParameters] = None - - self.vllm_config = self.runner.vllm_config - - def prepare(self): - self.slot_mapping: List[int] = [] - self.prefill_seq_lens: List[int] = [] - self.context_lens: List[int] = [] - self.block_tables: List[List[int]] = [] - self.curr_seq_lens: List[int] = [] - self.multimodal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - self.num_prefills = 0 - self.num_prefill_tokens = 0 - self.num_decode_tokens = 0 - - # Please follow https://docs.flashinfer.ai/tutorials/kv_layout.html#page-layout - # for the precise definition of the following fields. - # An example: - # request 1, page indices [0, 5, 8] - # request 2, page indices [1, 6, 7] - # request 3, page indices [3, 4] - # paged_kv_indices is a concatenation of page indices of all requests: - # [0, 5, 8, 1, 6, 7, 3, 4] - # paged_kv_indptr is used to index into paged_kv_indices: - # [0, 3, 6, 8] - self.paged_kv_indices: List[int] = [] - # 0 at the beginning of paged_kv_indptr indicates the start of the - # first request’s page indices in the paged_kv_indices list. - self.paged_kv_indptr: List[int] = [0] - # paged_kv_last_page_len is the length of the last page of each request - self.paged_kv_last_page_len: List[int] = [] - self.total_blocks = 0 - self.is_profile_run: bool = False - - if self.global_hyperparameters is None: - # Infer global hyperparameters, since currently we only support - # models in which all layers share the same values for the - # following hyperparameters: - # - `window_left` - # - `logits_soft_cap` - # - `sm_scale` - inferred_params = infer_global_hyperparameters( - get_per_layer_parameters(self.vllm_config)) - self.global_hyperparameters = inferred_params - self.window_left = inferred_params.window_left - self.logits_soft_cap = inferred_params.logits_soft_cap - self.sm_scale = inferred_params.sm_scale - - def _add_seq_group( - self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", - chunked_prefill_enabled: bool): - """Add a sequence group to the metadata. Specifically update/append - 1. context length. - 2. block table. - 3. slot mapping. 
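The page-table layout described in `prepare()` above packs every request's page indices into one flat list plus a CSR-style `paged_kv_indptr` offset array. A sketch reproducing the worked example from that comment (three requests with page indices [0, 5, 8], [1, 6, 7] and [3, 4]); `pack_paged_kv` is an illustrative helper, not a vLLM or FlashInfer API:

from typing import List, Tuple

def pack_paged_kv(
        page_indices_per_request: List[List[int]]) -> Tuple[List[int], List[int]]:
    """Concatenate per-request page indices and build the indptr offsets."""
    paged_kv_indices: List[int] = []
    paged_kv_indptr: List[int] = [0]  # 0 marks the start of the first request
    for pages in page_indices_per_request:
        paged_kv_indices.extend(pages)
        paged_kv_indptr.append(paged_kv_indptr[-1] + len(pages))
    return paged_kv_indices, paged_kv_indptr

indices, indptr = pack_paged_kv([[0, 5, 8], [1, 6, 7], [3, 4]])
assert indices == [0, 5, 8, 1, 6, 7, 3, 4]
assert indptr == [0, 3, 6, 8]
# Request i's pages are indices[indptr[i]:indptr[i + 1]].
assert indices[indptr[1]:indptr[2]] == [1, 6, 7]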
- """ - is_prompt = inter_data.is_prompt - block_tables = inter_data.block_tables - computed_block_nums = inter_data.computed_block_nums - - for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, - curr_sliding_window_block) in zip( - inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], - inter_data.orig_seq_lens, inter_data.seq_lens, - inter_data.query_lens, inter_data.context_lens, - inter_data.curr_sliding_window_blocks): - self.context_lens.append(context_len) - if is_prompt: - mm_maps = inter_data.multi_modal_placeholder_maps - if mm_maps: - for modality, placeholders in mm_maps.items(): - self.multimodal_placeholder_maps[modality].extend( - placeholders) - self.num_prefills += 1 - self.num_prefill_tokens += token_len - self.prefill_seq_lens.append(seq_len) - else: - assert query_len == 1, ( - "seq_len: {}, context_len: {}, query_len: {}".format( - seq_len, context_len, query_len)) - self.num_decode_tokens += query_len - self.curr_seq_lens.append(curr_seq_len) - - # Compute block table. - # TODO(sang): Combine chunked prefill and prefix caching by - # only allowing multiple of block_size chunk size. - # NOTE: This only works for oooooooxxx style attention. - block_table = [] - if inter_data.prefix_cache_hit: - block_table = computed_block_nums - elif ((chunked_prefill_enabled or not is_prompt) - and block_tables is not None): - block_table = block_tables[seq_id][-curr_sliding_window_block:] - self.block_tables.append(block_table) - - is_profile_run = is_block_tables_empty(block_tables) - - # Compute slot mapping. - start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, - context_len, - self.sliding_window) - compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id, - seq_len, context_len, start_idx, - self.block_size, inter_data.block_tables) - - # It is not necessary to add paged_kv_indices, paged_kv_indptr, - # and paged_kv_last_page_len for profile run because we will - # create dummy inputs. - if is_profile_run: - self.is_profile_run = is_profile_run - return - - block_table = block_tables[seq_id] - self._update_paged_kv_tensors(block_table, seq_len) - - def _update_paged_kv_tensors(self, block_table: List[int], seq_len: int): - # Get the number of valid blocks based on sequence length. - # If seq_len = 16, block_size = 16, - # block_table_bound is 1 with 1 valid block. - # If seq_len = 15, block_size = 16, - # block_table_bound is 0 + 1 with 1 valid block. - self.total_blocks += len(block_table) - block_table_bound = seq_len // self.block_size + 1 \ - if seq_len % self.block_size != 0 \ - else seq_len // self.block_size - self.paged_kv_indices.extend(block_table[:block_table_bound]) - self.paged_kv_indptr.append(self.paged_kv_indptr[-1] + - block_table_bound) - - last_page_len = seq_len % self.block_size - if last_page_len == 0: - last_page_len = self.block_size - self.paged_kv_last_page_len.append(last_page_len) - - def build(self, seq_lens: List[int], query_lens: List[int], - cuda_graph_pad_size: int, batch_size: int): - """Build attention metadata with on-device tensors. - - Args: - seq_lens: The maybe padded sequence lengths of the input sequences. - query_lens: The query lengths of the input sequences. - cuda_graph_pad_size: The padding size for cuda graph. - -1 if cuda graph is not used. - batch_size: The maybe padded batch size. 
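`_update_paged_kv_tensors` above derives two per-sequence quantities: the number of pages that actually hold tokens (`block_table_bound`, a ceiling division of `seq_len` by the page size) and how many entries sit in the last page (`paged_kv_last_page_len`, equal to the page size when the sequence ends exactly on a page boundary). A sketch reproducing the examples from its comment; `paged_kv_extents` is an illustrative name:

from typing import Tuple

def paged_kv_extents(seq_len: int, block_size: int) -> Tuple[int, int]:
    """Return (num_valid_pages, last_page_len) for one sequence in the paged cache."""
    num_valid_pages = seq_len // block_size + (1 if seq_len % block_size else 0)
    last_page_len = seq_len % block_size or block_size
    return num_valid_pages, last_page_len

assert paged_kv_extents(16, 16) == (1, 16)  # exactly one full page
assert paged_kv_extents(15, 16) == (1, 15)  # one partially filled page
assert paged_kv_extents(17, 16) == (2, 1)   # a full page plus one entry on page 2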
- """ - for inter_data in self.input_builder.inter_data_list: - self._add_seq_group(inter_data, - self.input_builder.chunked_prefill_enabled) - - device = self.runner.device - use_captured_graph = cuda_graph_pad_size != -1 - - max_prefill_seq_len = max(self.prefill_seq_lens, default=0) - max_decode_seq_len = max(self.curr_seq_lens, default=0) - num_decode_tokens = self.num_decode_tokens - decode_query_len = max(query_lens[self.num_prefills:], default=1) - - if use_captured_graph: - self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) - self.block_tables.extend([] * cuda_graph_pad_size) - num_decode_tokens = batch_size - self.num_prefill_tokens - - # The shape of graph_block_tables is - # [max batch size, max context len // block size]. - input_block_tables = self.runner.graph_block_tables[:batch_size] - max_blocks = input_block_tables.shape[1] - for i, block_table in enumerate(self.block_tables): - if block_table: - num_blocks = len(block_table) - if num_blocks <= max_blocks: - input_block_tables[i, :num_blocks] = block_table - else: - # It may be possible to have more blocks allocated due - # to lookahead slots of multi-step, however, they are - # not used anyway, so can be safely ignored. - input_block_tables[ - i, :max_blocks] = block_table[:max_blocks] - - block_tables = torch.from_numpy(input_block_tables).to( - device, non_blocking=True) - - last_paged_kv_indptr = self.paged_kv_indptr[-1] - self.paged_kv_indptr.extend([last_paged_kv_indptr] * - cuda_graph_pad_size) - self.paged_kv_last_page_len.extend([0] * cuda_graph_pad_size) - else: - block_tables = make_tensor_with_pad( - self.block_tables, - pad=0, - dtype=torch.int, - device=device, - ) - - assert device is not None - seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, - self.runner.pin_memory) - query_lens_tensor = async_tensor_h2d(query_lens, torch.long, device, - self.runner.pin_memory) - slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long, - device, self.runner.pin_memory) - query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=device) - seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=device) - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - self.multimodal_placeholder_maps.items() - } - torch.cumsum(seq_lens_tensor, - dim=0, - dtype=seq_start_loc.dtype, - out=seq_start_loc[1:]) - torch.cumsum(query_lens_tensor, - dim=0, - dtype=query_start_loc.dtype, - out=query_start_loc[1:]) - - if len(self.paged_kv_indptr) > 0: - # extend to the maximum number of blocks as returned by the - # scheduler - self.paged_kv_indices.extend( - [0] * (self.total_blocks - len(self.paged_kv_indices))) - paged_kv_indices_tensor = torch.tensor(self.paged_kv_indices, - device="cpu", - dtype=torch.int) - paged_kv_indptr_tensor = torch.tensor(self.paged_kv_indptr, - device="cpu", - dtype=torch.int) - paged_kv_last_page_len_tensor = torch.tensor( - self.paged_kv_last_page_len, device="cpu", dtype=torch.int) - block_table_bound_tensor = torch.zeros(len(self.paged_kv_indptr) - - 1, - device="cpu", - dtype=torch.int) - else: - paged_kv_indices_tensor = None - paged_kv_indptr_tensor = None - paged_kv_last_page_len_tensor = None - block_table_bound_tensor = None - - if self.runner.kv_cache_dtype.startswith("fp8"): - kv_cache_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer( - self.runner.kv_cache_dtype) - else: - kv_cache_dtype = get_kv_cache_torch_dtype( - 
self.runner.kv_cache_dtype, self.runner.model_config.dtype) - - return FlashInferMetadata( - decode_query_len=decode_query_len, - num_prefills=self.num_prefills, - slot_mapping=slot_mapping_tensor, - multi_modal_placeholder_index_maps=placeholder_index_maps, - enable_kv_scales_calculation=False, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - max_prefill_seq_len=max_prefill_seq_len, - max_decode_seq_len=max_decode_seq_len, - block_tables=block_tables, - paged_kv_indptr=paged_kv_indptr_tensor, - paged_kv_indices=paged_kv_indices_tensor, - paged_kv_last_page_len=paged_kv_last_page_len_tensor, - block_table_bound=block_table_bound_tensor, - seq_lens_tensor=seq_lens_tensor, - num_qo_heads=self.runner.model_config.get_num_attention_heads( - self.runner.parallel_config), - num_kv_heads=self.runner.model_config.get_num_kv_heads( - self.runner.parallel_config), - head_dim=self.runner.model_config.get_head_size(), - page_size=self.block_size, - seq_start_loc=seq_start_loc, - query_start_loc=query_start_loc, - device=device, - data_type=kv_cache_dtype, - q_data_type=self.runner.model_config.dtype, - use_cuda_graph=use_captured_graph, - is_profile_run=self.is_profile_run, - window_left=self.window_left, - logits_soft_cap=self.logits_soft_cap, - sm_scale=self.sm_scale, - ) - - -class FlashInferImpl(AttentionImpl): - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0 " - "FLASHINFER backend.") - if use_irope: - logger.warning_once( - "Using irope in FlashInfer is not supported yet, it will fall" - " back to global attention for long context.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = ((sliding_window - 1, - 0) if sliding_window is not None else (-1, -1)) - self.kv_cache_dtype = kv_cache_dtype - self.logits_soft_cap = logits_soft_cap - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "FlashInferImpl") - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: FlashInferMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - - if output_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for FlashInferImpl") - - # TODO: directly write to output tensor - num_heads: int = self.num_heads - head_size: int = self.head_size - num_kv_heads: int = self.num_kv_heads - kv_cache_dtype: str = self.kv_cache_dtype - softmax_scale: float = self.scale - window_size = self.sliding_window - alibi_slopes = self.alibi_slopes - logits_soft_cap = self.logits_soft_cap - - num_tokens, hidden_size = query.shape - query = query.view(-1, 
num_heads, head_size) - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) - - if kv_cache.numel() > 0: - # Use the same reshape and cache kernel as flash attention. - ops.reshape_and_cache_flash( - key, - value, - kv_cache[:, 0], - kv_cache[:, 1], - attn_metadata.slot_mapping.flatten(), - kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - # The FlashInfer api requires data to be in fp8_e4m3 or fp8_e5m2 - # to process the cache when the kv_cache_dtype is fp8 - if kv_cache_dtype.startswith("fp8"): - torch_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer( - kv_cache_dtype) - kv_cache = kv_cache.view(torch_dtype) - - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - assert key.shape[0] == num_prefill_tokens + num_decode_tokens, \ - f"key : {key.shape} : #prefill tokens {num_prefill_tokens} : #decode tokens {num_decode_tokens}" # noqa - assert value.shape[0] == num_prefill_tokens + num_decode_tokens, \ - f"value : {value.shape} : #prefill toks {num_prefill_tokens} : #decode toks {num_decode_tokens}" # noqa - query = query.contiguous( - ) # Flashinfer requires query to be contiguous - # Query for decode. KV is not needed because it is already cached. - # QKV for prefill. - decode_query = query[num_prefill_tokens:] - query = query[:num_prefill_tokens] - - key = key[:num_prefill_tokens] - value = value[:num_prefill_tokens] - - assert query.shape[0] == num_prefill_tokens - assert decode_query.shape[0] == num_decode_tokens - - window_left = window_size[0] if window_size is not None else -1 - - prefill_output: Optional[torch.Tensor] = None - decode_output: Optional[torch.Tensor] = None - stride_order = FlashInferBackend.get_kv_cache_stride_order() - if prefill_meta := attn_metadata.prefill_metadata: - # We will use flash attention for prefill - # when kv_cache is not provided. - # This happens when vllm runs the profiling to - # determine the number of blocks. - if kv_cache.numel() == 0: - prefill_output = flash_attn_varlen_func( - q=query, - k=key, - v=value, - cu_seqlens_q=prefill_meta.seq_start_loc, - cu_seqlens_k=prefill_meta.seq_start_loc, - max_seqlen_q=prefill_meta.max_prefill_seq_len, - max_seqlen_k=prefill_meta.max_prefill_seq_len, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, - ) - else: - assert prefill_meta is not None - assert prefill_meta.prefill_wrapper is not None - - assert prefill_meta.prefill_wrapper._causal - assert prefill_meta.prefill_wrapper._window_left == window_left - assert prefill_meta.prefill_wrapper._logits_soft_cap == ( - logits_soft_cap or 0.0) - assert prefill_meta.prefill_wrapper._sm_scale == softmax_scale - - prefill_output = prefill_meta.prefill_wrapper.run( - query, - kv_cache.permute(*stride_order), - k_scale=layer._k_scale_float, - v_scale=layer._v_scale_float, - ) - if decode_meta := attn_metadata.decode_metadata: - assert decode_meta is not None - assert decode_meta.decode_wrapper is not None - - assert decode_meta.decode_wrapper._window_left == window_left - assert decode_meta.decode_wrapper._logits_soft_cap == ( - logits_soft_cap or 0.0) - assert decode_meta.decode_wrapper._sm_scale == softmax_scale - # TODO: @pavanimajety Remove this once the switch happens - # inside flashinfer. 
- if not FlashInferBackend.use_trtllm_decode_attention( - num_decode_tokens, attn_metadata.max_decode_seq_len, - kv_cache_dtype, attn_metadata.num_qo_heads, - attn_metadata.num_kv_heads, attn_metadata.head_dim): - decode_output = decode_meta.decode_wrapper.run( - decode_query, - kv_cache.permute(*stride_order), - k_scale=layer._k_scale_float, - v_scale=layer._v_scale_float, - ) - else: - workspace_buffer = ( - decode_meta.decode_wrapper._int_workspace_buffer) - assert FlashInferState.get_kv_cache_layout() == "HND" - decode_output = trtllm_batch_decode_with_kv_cache( - query=decode_query, - kv_cache=kv_cache.permute(*stride_order), - workspace_buffer=workspace_buffer, - block_tables=attn_metadata.block_tables, - seq_lens=decode_meta.seq_lens_tensor, - max_seq_len=attn_metadata.max_decode_seq_len, - bmm1_scale=layer._k_scale_float * softmax_scale, - bmm2_scale=layer._v_scale_float, - ) - - if prefill_output is None and decode_output is not None: - # Decode only batch. - output, num_tokens = decode_output, num_decode_tokens - elif decode_output is None and prefill_output is not None: - # Prefill only batch. - output, num_tokens = prefill_output, num_prefill_tokens - else: - # Chunked prefill batch does not work with speculative decoding in - # FlashInfer backend, so the query length for decode should be 1. - assert prefill_output is not None - assert decode_output is not None - assert decode_meta is not None - assert decode_meta.decode_query_len == 1 - decode_output = decode_output.squeeze(1) - output = torch.cat([prefill_output, decode_output], dim=0) - return output.view(num_tokens, hidden_size) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py deleted file mode 100644 index 0bc38b414290..000000000000 --- a/vllm/attention/backends/xformers.py +++ /dev/null @@ -1,804 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Attention layer with xFormers and PagedAttention.""" -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Type - -import torch -from xformers import ops as xops -from xformers.ops.fmha.attn_bias import (AttentionBias, - BlockDiagonalCausalMask, - BlockDiagonalMask, - LowerTriangularMaskWithTensorBias) - -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, AttentionType) -from vllm.attention.backends.utils import ( - CommonAttentionState, CommonMetadataBuilder, - get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args, - is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set) -from vllm.attention.ops.paged_attn import (PagedAttention, - PagedAttentionMetadata) -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -class XFormersBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "XFORMERS" - - @staticmethod - def get_impl_cls() -> Type["XFormersImpl"]: - return XFormersImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return XFormersMetadata - - @staticmethod - def get_builder_cls() -> Type["XFormersMetadataBuilder"]: - return XFormersMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return PagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, 
head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], - ) -> None: - PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: torch.Tensor, - ) -> None: - PagedAttention.copy_blocks(kv_caches, src_to_dists) - - -@dataclass -class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata): - """Metadata for XFormersbackend. - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| - # |-- query_len ---| - - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - - # FIXME: It is for flash attn. - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. - max_decode_seq_len: int - - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - use_cuda_graph: bool - - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] = None - - # FIXME: It is for flash attn. - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] = None - - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). - context_lens_tensor: Optional[torch.Tensor] = None - - # Maximum query length in the batch. None for decoding. - max_query_len: Optional[int] = None - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] = None - - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] = None - - # Self-attention prefill/decode metadata cache - _cached_prefill_metadata: Optional["XFormersMetadata"] = None - _cached_decode_metadata: Optional["XFormersMetadata"] = None - - # Begin encoder attn & enc/dec cross-attn fields... - - # Encoder sequence lengths representation - encoder_seq_lens: Optional[List[int]] = None - encoder_seq_lens_tensor: Optional[torch.Tensor] = None - # FIXME: It is for flash attn. - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. 
- encoder_seq_start_loc: Optional[torch.Tensor] = None - - # Maximum sequence length among encoder sequences - max_encoder_seq_len: Optional[int] = None - - # Number of tokens input to encoder - num_encoder_tokens: Optional[int] = None - - # Cross-attention memory-mapping data structures: slot mapping - # and block tables - cross_slot_mapping: Optional[torch.Tensor] = None - cross_block_tables: Optional[torch.Tensor] = None - - def __post_init__(self): - # Set during the execution of the first attention op. - # It is a list because it is needed to set per prompt - # when alibi slopes is used. It is because of the limitation - # from xformer API. - # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[AttentionBias]] = None - self.encoder_attn_bias: Optional[List[AttentionBias]] = None - self.cross_attn_bias: Optional[List[AttentionBias]] = None - - @property - def is_all_encoder_attn_metadata_set(self): - ''' - All attention metadata required for encoder attention is set. - ''' - return is_all_encoder_attn_metadata_set(self) - - @property - def is_all_cross_attn_metadata_set(self): - ''' - All attention metadata required for enc/dec cross-attention is set. - - Superset of encoder attention required metadata. - ''' - return is_all_cross_attn_metadata_set(self) - - @property - def prefill_metadata(self) -> Optional["XFormersMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - # Recover cached prefill-phase attention - # metadata structure - return self._cached_prefill_metadata - - assert ((self.seq_lens is not None) - or (self.encoder_seq_lens is not None)) - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - query_start_loc = (None if self.query_start_loc is None else - self.query_start_loc[:self.num_prefills + 1]) - seq_start_loc = (None if self.seq_start_loc is None else - self.seq_start_loc[:self.num_prefills + 1]) - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[:self.num_prefill_tokens]) - seq_lens = (None if self.seq_lens is None else - self.seq_lens[:self.num_prefills]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[:self.num_prefills]) - context_lens_tensor = (None if self.context_lens_tensor is None else - self.context_lens_tensor[:self.num_prefills]) - block_tables = (None if self.block_tables is None else - self.block_tables[:self.num_prefills]) - - # Construct & cache prefill-phase attention metadata structure - self._cached_prefill_metadata = XFormersMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=self. - multi_modal_placeholder_index_maps, - enable_kv_scales_calculation=self.enable_kv_scales_calculation, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_seq_len=0, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=False, - # Begin encoder & cross attn fields below... 
- encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - return self._cached_prefill_metadata - - @property - def decode_metadata(self) -> Optional["XFormersMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - # Recover cached decode-phase attention - # metadata structure - return self._cached_decode_metadata - assert ((self.seq_lens_tensor is not None) - or (self.encoder_seq_lens_tensor is not None)) - - # Compute some attn_metadata fields which default to None - slot_mapping = (None if self.slot_mapping is None else - self.slot_mapping[self.num_prefill_tokens:]) - seq_lens_tensor = (None if self.seq_lens_tensor is None else - self.seq_lens_tensor[self.num_prefills:]) - block_tables = (None if self.block_tables is None else - self.block_tables[self.num_prefills:]) - - # Construct & cache decode-phase attention metadata structure - self._cached_decode_metadata = XFormersMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=True, - seq_lens_tensor=seq_lens_tensor, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - block_tables=block_tables, - use_cuda_graph=self.use_cuda_graph, - # Begin encoder & cross attn fields below... - encoder_seq_lens=self.encoder_seq_lens, - encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, - max_encoder_seq_len=self.max_encoder_seq_len, - cross_slot_mapping=self.cross_slot_mapping, - cross_block_tables=self.cross_block_tables) - - # Batch may be composed of prefill|decodes, adjust query start indices - # to refer to the start of decodes when the two are split apart. - # E.g. in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. - if self._cached_decode_metadata.query_start_loc is not None: - qs = self._cached_decode_metadata.query_start_loc - self._cached_decode_metadata.query_start_loc = qs - qs[0] - return self._cached_decode_metadata - - -def _get_attn_bias( - attn_metadata: XFormersMetadata, - attn_type: str, -) -> Optional[AttentionBias]: - ''' - Extract appropriate attention bias from attention metadata - according to attention type. - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - * Appropriate attention bias value given the attention type - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - return attn_metadata.attn_bias - elif attn_type == AttentionType.ENCODER: - return attn_metadata.encoder_attn_bias - elif attn_type == AttentionType.ENCODER_DECODER: - return attn_metadata.cross_attn_bias - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - -def _set_attn_bias( - attn_metadata: XFormersMetadata, - attn_bias: List[Optional[AttentionBias]], - attn_type: str, -) -> None: - ''' - Update appropriate attention bias field of attention metadata, - according to attention type. 
- - Arguments: - - * attn_metadata: Attention metadata structure associated with attention - * attn_bias: The desired attention bias value - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - ''' - - if (attn_type == AttentionType.DECODER - or attn_type == AttentionType.ENCODER_ONLY): - attn_metadata.attn_bias = attn_bias - elif attn_type == AttentionType.ENCODER: - attn_metadata.encoder_attn_bias = attn_bias - elif attn_type == AttentionType.ENCODER_DECODER: - attn_metadata.cross_attn_bias = attn_bias - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - -class XFormersMetadataBuilder(CommonMetadataBuilder[XFormersMetadata]): - - _metadata_cls = XFormersMetadata - - -class XFormersImpl(AttentionImpl[XFormersMetadata]): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| - |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - - Otherwise, the layout is as follows: - |<----------------- num_decode_tokens ------------------>| - |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| - - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - - The prompts might have different lengths, while the generation tokens - always have length 1. - - If chunked prefill is enabled, prefill tokens and decode tokens can be - batched together in a flattened 1D query. - - |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->| - |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->| - - Currently, cuda graph is disabled for chunked prefill, meaning there's no - padding between prefill and decode tokens. - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0 " - "XFORMERS backend.") - if logits_soft_cap is not None: - logger.warning_once("XFormers does not support logits soft cap. " - "Outputs may be slightly off.") - if use_irope: - logger.warning_once( - "Using irope in XFormers is not supported yet, it will fall" - " back to global attention for long context.") - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.num_kv_heads = num_kv_heads - if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) - self.alibi_slopes = alibi_slopes - self.sliding_window = sliding_window - self.kv_cache_dtype = kv_cache_dtype - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - supported_head_sizes = PagedAttention.get_supported_head_sizes() - if head_size not in supported_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by PagedAttention. 
" - f"Supported head sizes are: {supported_head_sizes}.") - - self.attn_type = attn_type - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: Optional[torch.Tensor], - value: Optional[torch.Tensor], - kv_cache: torch.Tensor, - attn_metadata: "XFormersMetadata", - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with xFormers and PagedAttention. - - For decoder-only models: query, key and value must be non-None. - - For encoder/decoder models: - * XFormersImpl.forward() may be invoked for both self- and cross- - attention layers. - * For self-attention: query, key and value must be non-None. - * For cross-attention: - * Query must be non-None - * During prefill, key and value must be non-None; key and value - get cached for use during decode. - * During decode, key and value may be None, since: - (1) key and value tensors were cached during prefill, and - (2) cross-attention key and value tensors do not grow during - decode - - A note on how the attn_type (attention type enum) argument impacts - attention forward() behavior: - - * DECODER: normal decoder-only behavior; - use decoder self-attention block table - * ENCODER: no KV caching; pass encoder sequence - attributes (encoder_seq_lens/encoder_seq_lens_tensor/ - max_encoder_seq_len) to kernel, in lieu of decoder - sequence attributes (seq_lens/seq_lens_tensor/max_seq_len). - Used for encoder branch of encoder-decoder models. - * ENCODER_ONLY: no kv_caching, uses the normal attention - attributes (seq_lens/seq_lens_tensor/max_seq_len). - * ENCODER_DECODER: cross-attention behavior; - use cross-attention block table for caching KVs derived - from encoder hidden states; since KV sequence lengths - will match encoder sequence lengths, pass encoder sequence - attributes to kernel (encoder_seq_lens/encoder_seq_lens_tensor/ - max_encoder_seq_len) - - Args: - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. - attn_type: Select attention type, between encoder attention, - decoder self-attention, or encoder/decoder cross- - attention. Defaults to decoder self-attention, - which is the vLLM default generally - Returns: - shape = [num_tokens, num_heads * head_size] - """ - if output_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for XFormersImpl") - - attn_type = self.attn_type - # Check that appropriate attention metadata attributes are - # selected for the desired attention type - if (attn_type == AttentionType.ENCODER - and (not attn_metadata.is_all_encoder_attn_metadata_set)): - raise AttributeError("Encoder attention requires setting " - "encoder metadata attributes.") - - elif (attn_type == AttentionType.ENCODER_DECODER - and (not attn_metadata.is_all_cross_attn_metadata_set)): - raise AttributeError("Encoder/decoder cross-attention " - "requires setting cross-attention " - "metadata attributes.") - - query = query.view(-1, self.num_heads, self.head_size) - if key is not None: - assert value is not None - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - else: - assert value is None - - # Self-attention vs. 
cross-attention will impact - # which KV cache memory-mapping & which - # seqlen datastructures we utilize - - if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0): - # KV-cache during decoder-self- or - # encoder-decoder-cross-attention, but not - # during encoder attention. - # - # Even if there are no new key/value pairs to cache, - # we still need to break out key_cache and value_cache - # i.e. for later use by paged attention - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - - if (key is not None) and (value is not None): - - if attn_type == AttentionType.ENCODER_DECODER: - # Update cross-attention KV cache (prefill-only) - # During cross-attention decode, key & value will be None, - # preventing this IF-statement branch from running - updated_slot_mapping = attn_metadata.cross_slot_mapping - else: - # Update self-attention KV cache (prefill/decode) - updated_slot_mapping = attn_metadata.slot_mapping - - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory - # profiling run. - PagedAttention.write_to_paged_cache( - key, value, key_cache, value_cache, updated_slot_mapping, - self.kv_cache_dtype, layer._k_scale, layer._v_scale) - (num_prefill_query_tokens, num_prefill_kv_tokens, - num_decode_query_tokens) = \ - get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) - - output = torch.empty_like(query) - # Query for decode. KV is not needed because it is already cached. - decode_query = query[num_prefill_query_tokens:] - # QKV for prefill. - query = query[:num_prefill_query_tokens] - if key is not None and value is not None: - key = key[:num_prefill_kv_tokens] - value = value[:num_prefill_kv_tokens] - - assert query.shape[0] == num_prefill_query_tokens - assert decode_query.shape[0] == num_decode_query_tokens - - if prefill_meta := attn_metadata.prefill_metadata: - # Prompt run. - if kv_cache.numel() == 0 or prefill_meta.block_tables.numel() == 0: - # normal attention. - # block tables are empty if the prompt does not have a cached - # prefix. - out = self._run_memory_efficient_xformers_forward( - query, key, value, prefill_meta, attn_type=attn_type) - assert out.shape == output[:num_prefill_query_tokens].shape - output[:num_prefill_query_tokens] = out - else: - assert attn_type != AttentionType.ENCODER_ONLY, ( - "Encoder-only models should not have prefix attention.") - - assert prefill_meta.query_start_loc is not None - assert prefill_meta.max_query_len is not None - - # prefix-enabled attention - # TODO(Hai) this triton kernel has regression issue (broke) to - # deal with different data types between KV and FP8 KV cache, - # to be addressed separately. 
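The deleted code above writes new keys and values into the paged cache through a slot mapping before running prefix or decode attention. A simplified, standalone sketch of how a flat slot index resolves to a (block, offset) position; the real vLLM kernel and cache layout differ, and every shape below is an assumption made only for this example:

import torch

block_size, num_blocks, num_kv_heads, head_size = 4, 8, 2, 8  # assumed sizes
key_cache = torch.zeros(num_blocks, block_size, num_kv_heads, head_size)

key = torch.randn(3, num_kv_heads, head_size)   # three new tokens to cache
slot_mapping = torch.tensor([5, 6, 13])         # flat slot assigned to each token

block_idx = slot_mapping // block_size          # which physical block
block_off = slot_mapping % block_size           # position inside that block
key_cache[block_idx, block_off] = key           # scatter the new keys into place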
- out = PagedAttention.forward_prefix( - query, - key, - value, - self.kv_cache_dtype, - key_cache, - value_cache, - prefill_meta.block_tables, - prefill_meta.query_start_loc, - prefill_meta.seq_lens_tensor, - prefill_meta.max_query_len, - self.alibi_slopes, - self.sliding_window, - layer._k_scale, - layer._v_scale, - ) - assert output[:num_prefill_query_tokens].shape == out.shape - output[:num_prefill_query_tokens] = out - - if decode_meta := attn_metadata.decode_metadata: - assert attn_type != AttentionType.ENCODER_ONLY, ( - "Encoder-only models should not have decode metadata.") - - ( - seq_lens_arg, - max_seq_len_arg, - block_tables_arg, - ) = get_seq_len_block_table_args(decode_meta, False, attn_type) - - output[num_prefill_query_tokens:] = PagedAttention.forward_decode( - decode_query, - key_cache, - value_cache, - block_tables_arg, - seq_lens_arg, - max_seq_len_arg, - self.kv_cache_dtype, - self.num_kv_heads, - self.scale, - self.alibi_slopes, - layer._k_scale, - layer._v_scale, - ) - - # Reshape the output tensor. - return output.view(-1, self.num_heads * self.head_size) - - def _run_memory_efficient_xformers_forward( - self, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_metadata: XFormersMetadata, - attn_type: str = AttentionType.DECODER, - ) -> torch.Tensor: - """Attention for 1D query of multiple prompts. Multiple prompt - tokens are flattened in to `query` input. - - See https://facebookresearch.github.io/xformers/components/ops.html - for API spec. - - Args: - output: shape = [num_prefill_tokens, num_heads, head_size] - query: shape = [num_prefill_tokens, num_heads, head_size] - key: shape = [num_prefill_tokens, num_kv_heads, head_size] - value: shape = [num_prefill_tokens, num_kv_heads, head_size] - attn_metadata: Metadata for attention. - attn_type: Select attention type, between encoder attention, - decoder self-attention, or encoder/decoder cross- - attention. Defaults to decoder self-attention, - which is the vLLM default generally - """ - - original_query = query - if self.num_kv_heads != self.num_heads: - # GQA/MQA requires the shape [B, M, G, H, K]. - # Note that the output also has the same shape (which is different - # from a spec from the doc). - query = query.view(query.shape[0], self.num_kv_heads, - self.num_queries_per_kv, query.shape[-1]) - key = key[:, :, - None, :].expand(key.shape[0], self.num_kv_heads, - self.num_queries_per_kv, key.shape[-1]) - value = value[:, :, - None, :].expand(value.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - value.shape[-1]) - - # Set attention bias if not provided. This typically happens at - # the very attention layer of every iteration. - # FIXME(woosuk): This is a hack. 
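The GQA/MQA branch just above reshapes the query to [tokens, kv_heads, queries_per_kv, head_size] and broadcasts each KV head across its query group. A small self-contained illustration of that reshaping; the sizes are chosen arbitrarily for the example:

import torch

num_tokens, num_heads, num_kv_heads, head_size = 5, 8, 2, 16  # example sizes
queries_per_kv = num_heads // num_kv_heads

q = torch.randn(num_tokens, num_heads, head_size)
k = torch.randn(num_tokens, num_kv_heads, head_size)

# Group query heads by the KV head they share.
q_grouped = q.view(num_tokens, num_kv_heads, queries_per_kv, head_size)
# Broadcast each KV head across its query group without copying data.
k_expanded = k[:, :, None, :].expand(num_tokens, num_kv_heads,
                                     queries_per_kv, head_size)

assert q_grouped.shape == k_expanded.shape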
- attn_bias = _get_attn_bias(attn_metadata, attn_type) - if attn_bias is None: - if self.alibi_slopes is None: - - # Cross attention block of decoder branch of encoder-decoder - # model uses seq_lens for dec / encoder_seq_lens for enc - if (attn_type == AttentionType.ENCODER_DECODER): - assert attn_metadata.seq_lens is not None - assert attn_metadata.encoder_seq_lens is not None - - # Cross-attention mask is non-causal - attn_bias = BlockDiagonalMask.from_seqlens( - attn_metadata.seq_lens, - attn_metadata.encoder_seq_lens, - device=query.device) - - # Encoder branch of encoder-decoder model uses - # attn_metadata.encoder_seq_lens - elif attn_type == AttentionType.ENCODER: - - assert attn_metadata.encoder_seq_lens is not None - - # Encoder self-attention mask is non-causal - attn_bias = BlockDiagonalMask.from_seqlens( - attn_metadata.encoder_seq_lens, device=query.device) - - # Self-attention block of encoder-only model just - # uses the seq_lens directly. - elif attn_type == AttentionType.ENCODER_ONLY: - assert attn_metadata.seq_lens is not None - - # Encoder self-attention mask is non-causal - attn_bias = BlockDiagonalMask.from_seqlens( - attn_metadata.seq_lens, device=query.device) - - # Self-attention block of decoder branch just - # uses the seq_lens directly - elif attn_type == AttentionType.DECODER: - assert attn_metadata.seq_lens is not None - - # Decoder self-attention mask is causal - attn_bias = BlockDiagonalCausalMask.from_seqlens( - attn_metadata.seq_lens, device=query.device) - else: - raise ValueError("Unknown AttentionType: %s", attn_type) - - if self.sliding_window is not None: - attn_bias = attn_bias.make_local_attention( - self.sliding_window) - attn_bias = [attn_bias] - else: - assert attn_type == AttentionType.DECODER - assert attn_metadata.seq_lens is not None - attn_bias = _make_alibi_bias(self.alibi_slopes, - self.num_kv_heads, query.dtype, - attn_metadata.seq_lens) - - _set_attn_bias(attn_metadata, attn_bias, attn_type) - - # No alibi slopes. - # TODO(woosuk): Too many view operations. Let's try to reduce - # them in the future for code readability. - if self.alibi_slopes is None: - # Add the batch dimension. - query = query.unsqueeze(0) - key = key.unsqueeze(0) - value = value.unsqueeze(0) - out = xops.memory_efficient_attention_forward( - query, - key, - value, - attn_bias=attn_bias[0], - p=0.0, - scale=self.scale) - return out.view_as(original_query) - - # Attention with alibi slopes. - # FIXME(woosuk): Because xformers does not support dynamic sequence - # lengths with custom attention bias, we process each prompt one by - # one. This is inefficient, especially when we have many short prompts. - assert attn_metadata.seq_lens is not None - output = torch.empty_like(original_query) - start = 0 - for i, seq_len in enumerate(attn_metadata.seq_lens): - end = start + seq_len - out = xops.memory_efficient_attention_forward( - query[None, start:end], - key[None, start:end], - value[None, start:end], - attn_bias=attn_bias[i], - p=0.0, - scale=self.scale) - # TODO(woosuk): Unnecessary copy. Optimize. - output[start:end].copy_(out.view_as(original_query[start:end])) - start += seq_len - return output - - -def _make_alibi_bias( - alibi_slopes: torch.Tensor, - num_kv_heads: int, - dtype: torch.dtype, - seq_lens: List[int], -) -> List[AttentionBias]: - attn_biases: List[AttentionBias] = [] - for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. 
We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. - # Calculate a matrix where each element represents ith element- jth - # element. - bias = bias[None, :] - bias[:, None] - - padded_len = (seq_len + 7) // 8 * 8 - num_heads = alibi_slopes.shape[0] - bias = torch.empty( - 1, # batch size - num_heads, - seq_len, - padded_len, - device=alibi_slopes.device, - dtype=dtype, - )[:, :, :, :seq_len].copy_(bias) - bias.mul_(alibi_slopes[:, None, None]) - attn_biases.append(LowerTriangularMaskWithTensorBias(bias)) - - return attn_biases diff --git a/vllm/core/__init__.py b/vllm/core/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/vllm/core/block/__init__.py b/vllm/core/block/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py deleted file mode 100644 index 444bb25f2830..000000000000 --- a/vllm/core/block/block_table.py +++ /dev/null @@ -1,399 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -from typing import List, Optional - -from vllm.core.block.common import BlockList -from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator -from vllm.utils import Device, cdiv, chunk_list - - -class BlockTable: - """A class to manage blocks for a specific sequence. - - The BlockTable maps a sequence of tokens to a list of blocks, where each - block represents a contiguous memory allocation for a portion of the - sequence. The blocks are managed by a DeviceAwareBlockAllocator, which is - responsible for allocating and freeing memory for the blocks. - - Args: - block_size (int): The maximum number of tokens that can be stored in a - single block. - block_allocator (DeviceAwareBlockAllocator): The block allocator used to - manage memory for the blocks. - _blocks (Optional[List[Block]], optional): An optional list of existing - blocks to initialize the BlockTable with. If not provided, an empty - BlockTable is created. - max_block_sliding_window (Optional[int], optional): The number of - blocks to keep around for each sequence. If None, all blocks - are kept (eg., when sliding window is not used). - It should at least fit the sliding window size of the model. - - Attributes: - _block_size (int): The maximum number of tokens that can be stored in a - single block. - _allocator (DeviceAwareBlockAllocator): The block allocator used to - manage memory for the blocks. - _blocks (Optional[List[Block]]): The list of blocks managed by this - BlockTable. - _num_full_slots (int): The number of tokens currently stored in the - blocks. - """ - - def __init__( - self, - block_size: int, - block_allocator: DeviceAwareBlockAllocator, - _blocks: Optional[List[Block]] = None, - max_block_sliding_window: Optional[int] = None, - ): - self._block_size = block_size - self._allocator = block_allocator - if _blocks is None: - _blocks = [] - self._blocks: BlockList = BlockList(_blocks) - - self._max_block_sliding_window = max_block_sliding_window - self._num_full_slots = self._get_num_token_ids() - - @staticmethod - def get_num_required_blocks(token_ids: List[int], - block_size: int, - num_lookahead_slots: int = 0) -> int: - """Calculates the minimum number of blocks required to store a given - sequence of token IDs along with any look-ahead slots that may be - required (like in multi-step + chunked-prefill). 
- - This assumes worst-case scenario, where every block requires a new - allocation (e.g. ignoring prefix caching). - - Args: - token_ids (List[int]): The sequence of token IDs to be stored. - block_size (int): The maximum number of tokens that can be stored in - a single block. - num_lookahead_slots (int): look-ahead slots that the sequence may - require. - - Returns: - int: The minimum number of blocks required to store the given - sequence of token IDs along with any required look-ahead slots. - """ - return cdiv(len(token_ids) + num_lookahead_slots, block_size) - - def allocate(self, - token_ids: List[int], - device: Device = Device.GPU, - extra_hash: Optional[int] = None) -> None: - """Allocates memory blocks for storing the given sequence of token IDs. - - This method allocates the required number of blocks to store the given - sequence of token IDs. - - Args: - token_ids (List[int]): The sequence of token IDs to be stored. - device (Device, optional): The device on which the blocks should be - allocated. Defaults to Device.GPU. - extra_hash (Optional[int]): The hash value of additional - factors, such as adapters, that influence the block hash - in the prefixcaching block. - """ - assert not self._is_allocated - assert token_ids - blocks = self._allocate_blocks_for_token_ids(prev_block=None, - token_ids=token_ids, - device=device, - extra_hash=extra_hash) - self.update(blocks) - self._num_full_slots = len(token_ids) - - def update(self, blocks: List[Block]) -> None: - """Resets the table to the newly provided blocks - (with their corresponding block ids) - """ - self._blocks.update(blocks) - - def append_token_ids(self, - token_ids: List[int], - num_lookahead_slots: int = 0, - num_computed_slots: Optional[int] = None, - extra_hash: Optional[int] = None) -> None: - """Appends a sequence of token IDs to the existing blocks in the - BlockTable. - - This method appends the given sequence of token IDs to the existing - blocks in the BlockTable. If there is not enough space in the existing - blocks, new blocks are allocated using the `ensure_num_empty_slots` - method to accommodate the additional tokens. - - The token IDs are divided into chunks of size `block_size` (except for - the first chunk, which may be smaller), and each chunk is appended to a - separate block. - - Args: - token_ids (List[int]): The sequence of token IDs to be appended. - num_computed_slots (Optional[int]): The number of KV cache slots - that are already filled (computed). - When sliding window is enabled, this is used to compute how many - blocks to drop at the front of the sequence. - Without sliding window, None can be passed. - Without chunked prefill, it should be the same as - _num_full_slots. - extra_hash (Optional[int]): The hash value of additional - factors such as adapters that influence the block, apart - from the token_ids. 
- """ - assert self._is_allocated, "no blocks have been allocated" - assert len(self._blocks) > 0 - - # Drop blocks that are no longer needed due to sliding window - if self._max_block_sliding_window is not None: - null_block = self._allocator.allocate_or_get_null_block() - assert num_computed_slots is not None - end_block_idx = (num_computed_slots // - self._block_size) - self._max_block_sliding_window - for idx in range(0, end_block_idx): - b = self._blocks[idx] - if b is not null_block: - self._allocator.free(b) - self._blocks[idx] = null_block - - # Ensure there are enough empty slots for the new tokens plus - # lookahead slots - self.ensure_num_empty_slots(num_empty_slots=len(token_ids) + - num_lookahead_slots, - extra_hash=extra_hash) - - # Update the blocks with the new tokens - first_block_idx = self._num_full_slots // self._block_size - token_blocks = self._chunk_token_blocks_for_append(token_ids) - - for i, token_block in enumerate(token_blocks): - self._blocks.append_token_ids(first_block_idx + i, token_block) - - self._num_full_slots += len(token_ids) - - def ensure_num_empty_slots(self, - num_empty_slots: int, - extra_hash: Optional[int] = None) -> None: - """Ensures that the BlockTable has at least the specified number of - empty slots available. - - This method checks if the BlockTable has enough empty slots (i.e., - available space) to accommodate the requested number of tokens. If not, - it allocates additional blocks on the GPU to ensure that the required - number of empty slots is available. - - Args: - num_empty_slots (int): The minimum number of empty slots required. - extra_hash (Optional[int]): The hash value of additional - factors such as adapters that influence the block, apart - from the token_ids. - """ - # Currently the block table only supports - # appending tokens to GPU blocks. - device = Device.GPU - assert self._is_allocated - - if self._num_empty_slots >= num_empty_slots: - return - - slots_to_allocate = num_empty_slots - self._num_empty_slots - blocks_to_allocate = cdiv(slots_to_allocate, self._block_size) - - for _ in range(blocks_to_allocate): - assert len(self._blocks) > 0 - self._blocks.append( - self._allocator.allocate_mutable_block( - prev_block=self._blocks[-1], - device=device, - extra_hash=extra_hash)) - - def fork(self) -> "BlockTable": - """Creates a new BlockTable instance with a copy of the blocks from the - current instance. - - This method creates a new BlockTable instance with the same block size, - block allocator, and a copy of the blocks from the current instance. The - new BlockTable has its own independent set of blocks, but shares the - same underlying memory allocation with the original BlockTable. - - Returns: - BlockTable: A new BlockTable instance with a copy of the blocks from - the current instance. - """ - assert self._is_allocated - assert len(self._blocks) > 0 - forked_blocks = self._allocator.fork(self._blocks[-1]) - return BlockTable( - block_size=self._block_size, - block_allocator=self._allocator, - _blocks=forked_blocks, - max_block_sliding_window=self._max_block_sliding_window, - ) - - def free(self) -> None: - """Frees the memory occupied by the blocks in the BlockTable. - - This method iterates over all the blocks in the `_blocks` list and calls - the `free` method of the `_allocator` object to release the memory - occupied by each block. After freeing all the blocks, the `_blocks` list - is set to `None`. 
- """ - for block in self.blocks: - self._allocator.free(block) - self._blocks.reset() - - @property - def physical_block_ids(self) -> List[int]: - """Returns a list of physical block indices for the blocks in the - BlockTable. - - This property returns a list of integers, where each integer represents - the physical block index of a corresponding block in the `_blocks` list. - The physical block index is a unique identifier for the memory location - occupied by the block. - - Returns: - List[int]: A list of physical block indices for the blocks in the - BlockTable. - """ - return self._blocks.ids() - - def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]: - """Get the number of "unseen" tokens in the sequence. - - Unseen tokens are tokens in the sequence corresponding to this block - table, but are not yet appended to this block table. - - Args: - sequence_token_ids (List[int]): The list of token ids in the - sequence. - - Returns: - List[int]: The postfix of sequence_token_ids that has not yet been - appended to the block table. - """ - - # Since the block table is append-only, the unseen token ids are the - # ones after the appended ones. - return sequence_token_ids[self.num_full_slots:] - - def _allocate_blocks_for_token_ids( - self, - prev_block: Optional[Block], - token_ids: List[int], - device: Device, - extra_hash: Optional[int] = None) -> List[Block]: - blocks: List[Block] = [] - - block_token_ids = [] - tail_token_ids = [] - for cur_token_ids in chunk_list(token_ids, self._block_size): - if len(cur_token_ids) == self._block_size: - block_token_ids.append(cur_token_ids) - else: - tail_token_ids.append(cur_token_ids) - - if block_token_ids: - blocks.extend( - self._allocator.allocate_immutable_blocks( - prev_block, - block_token_ids=block_token_ids, - device=device, - extra_hash=extra_hash)) - prev_block = blocks[-1] - - if tail_token_ids: - assert len(tail_token_ids) == 1 - cur_token_ids = tail_token_ids[0] - - block = self._allocator.allocate_mutable_block( - prev_block=prev_block, device=device, extra_hash=extra_hash) - block.append_token_ids(cur_token_ids) - - blocks.append(block) - - return blocks - - def _get_all_token_ids(self) -> List[int]: - # NOTE: This function is O(seq_len); use sparingly. - token_ids: List[int] = [] - - if not self._is_allocated: - return token_ids - - for block in self.blocks: - token_ids.extend(block.token_ids) - - return token_ids - - def _get_num_token_ids(self) -> int: - res = 0 - for block in self.blocks: - res += len(block.token_ids) - - return res - - @property - def _is_allocated(self) -> bool: - return len(self._blocks) > 0 - - @property - def blocks(self) -> List[Block]: - return self._blocks.list() - - @property - def _num_empty_slots(self) -> int: - assert self._is_allocated - return len(self._blocks) * self._block_size - self._num_full_slots - - @property - def num_full_slots(self) -> int: - """Returns the total number of tokens currently stored in the - BlockTable. - - Returns: - int: The total number of tokens currently stored in the BlockTable. - """ - return self._num_full_slots - - def get_num_blocks_touched_by_append_slots( - self, token_ids: List[int], num_lookahead_slots: int) -> int: - """Determine how many blocks will be "touched" by appending the token - ids. - - This is required for the scheduler to determine whether a sequence can - continue generation, or if it must be preempted. 
- """ - # Math below is equivalent to: - # all_token_ids = token_ids + [-1] * num_lookahead_slots - # token_blocks = self._chunk_token_blocks_for_append(all_token_ids) - # return len(token_blocks) - - num_token_ids = len(token_ids) + num_lookahead_slots - first_chunk_size = self._block_size - (self._num_full_slots % - self._block_size) - num_token_blocks = (1 + math.ceil( - (num_token_ids - first_chunk_size) / self._block_size)) - return num_token_blocks - - def _chunk_token_blocks_for_append( - self, token_ids: List[int]) -> List[List[int]]: - """Split the token ids into block-sized chunks so they can be easily - appended to blocks. The first such "token block" may have less token ids - than the block size, since the last allocated block may be partially - full. - - If no token ids are provided, then no chunks are returned. - """ - - if not token_ids: - return [] - - first_chunk_size = self._block_size - (self._num_full_slots % - self._block_size) - token_blocks = [token_ids[:first_chunk_size]] - token_blocks.extend( - chunk_list(token_ids[first_chunk_size:], self._block_size)) - return token_blocks diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py deleted file mode 100644 index a337007a9eaa..000000000000 --- a/vllm/core/block/common.py +++ /dev/null @@ -1,371 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections import deque -from dataclasses import dataclass -from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple - -from vllm.core.block.interfaces import Block, BlockAllocator - -BlockId = int -RefCount = int - - -class RefCounterProtocol(Protocol): - - def incr(self, block_id: BlockId) -> RefCount: - raise NotImplementedError - - def decr(self, block_id: BlockId) -> RefCount: - raise NotImplementedError - - def get(self, block_id: BlockId) -> RefCount: - raise NotImplementedError - - -class RefCounter(RefCounterProtocol): - """A class for managing reference counts for a set of block indices. - - The RefCounter class maintains a dictionary that maps block indices to their - corresponding reference counts. It provides methods to increment, decrement, - and retrieve the reference count for a given block index. - - Args: - all_block_indices (Iterable[BlockId]): An iterable of block indices - to initialize the reference counter with. - """ - - def __init__(self, all_block_indices: Iterable[BlockId]): - deduped = set(all_block_indices) - self._refcounts: Dict[BlockId, RefCount] = { - index: 0 - for index in deduped - } - - def incr(self, block_id: BlockId) -> RefCount: - assert block_id in self._refcounts - pre_incr_refcount = self._refcounts[block_id] - - assert pre_incr_refcount >= 0 - - post_incr_refcount = pre_incr_refcount + 1 - self._refcounts[block_id] = post_incr_refcount - return post_incr_refcount - - def decr(self, block_id: BlockId) -> RefCount: - assert block_id in self._refcounts - refcount = self._refcounts[block_id] - - assert refcount > 0 - refcount -= 1 - - self._refcounts[block_id] = refcount - - return refcount - - def get(self, block_id: BlockId) -> RefCount: - assert block_id in self._refcounts - return self._refcounts[block_id] - - def as_readonly(self) -> "ReadOnlyRefCounter": - return ReadOnlyRefCounter(self) - - -class ReadOnlyRefCounter(RefCounterProtocol): - """A read-only view of the RefCounter class. - - The ReadOnlyRefCounter class provides a read-only interface to access the - reference counts maintained by a RefCounter instance. 
It does not allow - modifications to the reference counts. - - Args: - refcounter (RefCounter): The RefCounter instance to create a read-only - view for. - """ - - def __init__(self, refcounter: RefCounter): - self._refcounter = refcounter - - def incr(self, block_id: BlockId) -> RefCount: - raise ValueError("Incr not allowed") - - def decr(self, block_id: BlockId) -> RefCount: - raise ValueError("Decr not allowed") - - def get(self, block_id: BlockId) -> RefCount: - return self._refcounter.get(block_id) - - -class CopyOnWriteTracker: - """A class for tracking and managing copy-on-write operations for blocks. - - The CopyOnWriteTracker class maintains a mapping of source block indices to - their corresponding copy-on-write destination block indices. It works in - conjunction with a RefCounter. - - Args: - refcounter (RefCounter): The reference counter used to track block - reference counts. - """ - - def __init__(self, refcounter: RefCounterProtocol): - self._copy_on_writes: List[Tuple[BlockId, BlockId]] = [] - self._refcounter = refcounter - - def is_appendable(self, block: Block) -> bool: - """Checks if the block is shared or not. If shared, then it cannot - be appended and needs to be duplicated via copy-on-write - """ - block_id = block.block_id - if block_id is None: - return True - - refcount = self._refcounter.get(block_id) - return refcount <= 1 - - def record_cow(self, src_block_id: Optional[BlockId], - trg_block_id: Optional[BlockId]) -> None: - """Records a copy-on-write operation from source to target block id - Args: - src_block_id (BlockId): The source block id from which to copy - the data - trg_block_id (BlockId): The target block id to which the data - is copied - """ - assert src_block_id is not None - assert trg_block_id is not None - self._copy_on_writes.append((src_block_id, trg_block_id)) - - def clear_cows(self) -> List[Tuple[BlockId, BlockId]]: - """Clears the copy-on-write tracking information and returns the current - state. - - This method returns a list mapping source block indices to - destination block indices for the current copy-on-write operations. - It then clears the internal tracking information. - - Returns: - List[Tuple[BlockId, BlockId]]: A list mapping source - block indices to destination block indices for the - current copy-on-write operations. - """ - cows = self._copy_on_writes - self._copy_on_writes = [] - return cows - - -class BlockPool: - """Used to pre-allocate block objects, in order to avoid excessive python - object allocations/deallocations. - The pool starts from "pool_size" objects and will increase to more objects - if necessary - - Note that multiple block objects may point to the same physical block id, - which is why this pool is needed, so that it will be easier to support - prefix caching and more complicated sharing of physical blocks. 
- """ - - def __init__(self, block_size: int, create_block: Block.Factory, - allocator: BlockAllocator, pool_size: int): - self._block_size = block_size - self._create_block = create_block - self._allocator = allocator - self._pool_size = pool_size - assert self._pool_size >= 0 - - self._free_ids: Deque[int] = deque(range(self._pool_size)) - self._pool = [] - for i in range(self._pool_size): - self._pool.append( - self._create_block(prev_block=None, - token_ids=[], - block_size=self._block_size, - allocator=self._allocator, - block_id=None, - extra_hash=None)) - - def increase_pool(self): - """Doubles the internal pool size - """ - cur_pool_size = self._pool_size - new_pool_size = cur_pool_size * 2 - self._pool_size = new_pool_size - - self._free_ids += deque(range(cur_pool_size, new_pool_size)) - - for i in range(cur_pool_size, new_pool_size): - self._pool.append( - self._create_block(prev_block=None, - token_ids=[], - block_size=self._block_size, - allocator=self._allocator, - block_id=None, - extra_hash=None)) - - def init_block(self, - prev_block: Optional[Block], - token_ids: List[int], - block_size: int, - physical_block_id: Optional[int], - extra_hash: Optional[int] = None) -> Block: - if len(self._free_ids) == 0: - self.increase_pool() - assert len(self._free_ids) > 0 - - pool_id = self._free_ids.popleft() - - block = self._pool[pool_id] - block.__init__( # type: ignore[misc] - prev_block=prev_block, - token_ids=token_ids, - block_size=block_size, - allocator=block._allocator, # type: ignore[attr-defined] - block_id=physical_block_id, - extra_hash=extra_hash) - block.pool_id = pool_id # type: ignore[attr-defined] - return block - - def free_block(self, block: Block) -> None: - self._free_ids.appendleft(block.pool_id) # type: ignore[attr-defined] - - -class BlockList: - """This class is an optimization to allow fast-access to physical - block ids. 
It maintains a block id list that is updated with the - block list and this avoids the need to reconstruct the block id - list on every iteration of the block manager - """ - - def __init__(self, blocks: List[Block]): - self._blocks: List[Block] = [] - self._block_ids: List[int] = [] - - self.update(blocks) - - def _add_block_id(self, block_id: Optional[BlockId]) -> None: - assert block_id is not None - self._block_ids.append(block_id) - - def _update_block_id(self, block_index: int, - new_block_id: Optional[BlockId]) -> None: - assert new_block_id is not None - self._block_ids[block_index] = new_block_id - - def update(self, blocks: List[Block]): - self._blocks = blocks - - # Cache block ids for fast query - self._block_ids = [] - for block in self._blocks: - self._add_block_id(block.block_id) - - def append_token_ids(self, block_index: int, token_ids: List[int]) -> None: - block = self._blocks[block_index] - prev_block_id = block.block_id - - block.append_token_ids(token_ids) - - # CoW or promotion may update the internal block_id - if prev_block_id != block.block_id: - self._update_block_id(block_index, block.block_id) - - def append(self, new_block: Block): - self._blocks.append(new_block) - self._add_block_id(new_block.block_id) - - def __len__(self) -> int: - return len(self._blocks) - - def __getitem__(self, block_index: int) -> Block: - return self._blocks[block_index] - - def __setitem__(self, block_index: int, new_block: Block) -> None: - self._blocks[block_index] = new_block - self._update_block_id(block_index, new_block.block_id) - - def reset(self): - self._blocks = [] - self._block_ids = [] - - def list(self) -> List[Block]: - return self._blocks - - def ids(self) -> List[int]: - return self._block_ids - - -@dataclass -class CacheMetricData: - """A utility dataclass to maintain cache metric. - To avoid overflow, we maintain the hit rate in block granularity, so that - we can maintain a single hit rate for n_completed_block x block_size, - and calculate the real time hit rate by the following: - BS = The number of queries per block. - nB = The number of completed blocks. - HR = hit rate of (nB x BS) queries. - Q = current number of queries (< BS). - H = current number of hits (< BS). - hit rate = ((HR x nB) + (H / Q) x (Q / BS)) / (nB + Q / BS) - """ - num_completed_blocks: int = 0 - completed_block_cache_hit_rate: float = 0.0 - num_incompleted_block_queries: int = 0 - num_incompleted_block_hit: int = 0 - block_size: int = 1000 - - def query(self, hit: bool): - self.num_incompleted_block_queries += 1 - self.num_incompleted_block_hit += 1 if hit else 0 - - # When a block is completed, update the cache hit rate - # and reset the incomplete numbers. 
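The hit-rate formula documented in the CacheMetricData docstring above blends the rate over completed blocks with the partially filled current block. A small worked example of that formula, using assumed numbers:

BS = 1000          # queries per completed block (block_size above)
nB = 2             # completed blocks
HR = 0.75          # hit rate accumulated over those completed blocks
Q, H = 500, 400    # queries and hits in the current, incomplete block

incomplete_ratio = Q / BS
hit_rate = (HR * nB + (H / Q) * incomplete_ratio) / (nB + incomplete_ratio)
print(round(hit_rate, 4))  # 0.76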
- if self.num_incompleted_block_queries == self.block_size: - hit_rate = (self.num_incompleted_block_hit / - self.num_incompleted_block_queries) - self.completed_block_cache_hit_rate = ( - self.completed_block_cache_hit_rate * self.num_completed_blocks - + hit_rate) / (self.num_completed_blocks + 1) - self.num_incompleted_block_queries = 0 - self.num_incompleted_block_hit = 0 - self.num_completed_blocks += 1 - - def get_hit_rate(self): - incomplete_ratio = self.num_incompleted_block_queries / self.block_size - total_blocks = self.num_completed_blocks + incomplete_ratio - if total_blocks == 0: - return 0.0 - - completed_block_hit, incompleted_block_hit = 0.0, 0.0 - if self.num_completed_blocks > 0: - completed_block_hit = (self.completed_block_cache_hit_rate * - self.num_completed_blocks) - if self.num_incompleted_block_queries > 0: - incompleted_hit_rate = (self.num_incompleted_block_hit / - self.num_incompleted_block_queries) - incompleted_block_hit = (incompleted_hit_rate * incomplete_ratio) - return (completed_block_hit + incompleted_block_hit) / total_blocks - - -def get_all_blocks_recursively(last_block: Block) -> List[Block]: - """Retrieves all the blocks in a sequence starting from the last block. - - This function recursively traverses the sequence of blocks in reverse order, - starting from the given last block, and returns a list of all the blocks in - the sequence. - - Args: - last_block (Block): The last block in the sequence. - - Returns: - List[Block]: A list of all the blocks in the sequence, in the order they - appear. - """ - - def recurse(block: Block, lst: List[Block]) -> None: - if block.prev_block is not None: - recurse(block.prev_block, lst) - lst.append(block) - - all_blocks: List[Block] = [] - recurse(last_block, all_blocks) - return all_blocks diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py deleted file mode 100644 index 92bc5e157e14..000000000000 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ /dev/null @@ -1,439 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Dict, FrozenSet, List, Optional, Tuple - -from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, - DeviceAwareBlockAllocator) -from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator -from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.utils import Device - - -class CpuGpuBlockAllocator(DeviceAwareBlockAllocator): - """A block allocator that can allocate blocks on both CPU and GPU memory. - - This class implements the `DeviceAwareBlockAllocator` interface and provides - functionality for allocating and managing blocks of memory on both CPU and - GPU devices. - - The `CpuGpuBlockAllocator` maintains separate memory pools for CPU and GPU - blocks, and allows for allocation, deallocation, forking, and swapping of - blocks across these memory pools. - """ - - @staticmethod - def create( - allocator_type: str, - num_gpu_blocks: int, - num_cpu_blocks: int, - block_size: int, - ) -> DeviceAwareBlockAllocator: - """Creates a CpuGpuBlockAllocator instance with the specified - configuration. - - This static method creates and returns a CpuGpuBlockAllocator instance - based on the provided parameters. It initializes the CPU and GPU block - allocators with the specified number of blocks, block size, and - allocator type. 
- - Args: - allocator_type (str): The type of block allocator to use for CPU - and GPU blocks. Currently supported values are "naive" and - "prefix_caching". - num_gpu_blocks (int): The number of blocks to allocate for GPU - memory. - num_cpu_blocks (int): The number of blocks to allocate for CPU - memory. - block_size (int): The size of each block in number of tokens. - - Returns: - DeviceAwareBlockAllocator: A CpuGpuBlockAllocator instance with the - specified configuration. - - Notes: - - The block IDs are assigned contiguously, with GPU block IDs coming - before CPU block IDs. - """ - reserved_blocks = 0 - block_ids = list( - range(reserved_blocks, num_gpu_blocks + num_cpu_blocks)) - num_gpu_blocks -= reserved_blocks - gpu_block_ids = block_ids[:num_gpu_blocks] - cpu_block_ids = block_ids[num_gpu_blocks:] - - if allocator_type == "naive": - gpu_allocator: BlockAllocator = NaiveBlockAllocator( - create_block=NaiveBlock, # type: ignore - num_blocks=num_gpu_blocks, - block_size=block_size, - block_ids=gpu_block_ids, - ) - - cpu_allocator: BlockAllocator = NaiveBlockAllocator( - create_block=NaiveBlock, # type: ignore - num_blocks=num_cpu_blocks, - block_size=block_size, - block_ids=cpu_block_ids, - ) - elif allocator_type == "prefix_caching": - gpu_allocator = PrefixCachingBlockAllocator( - num_blocks=num_gpu_blocks, - block_size=block_size, - block_ids=gpu_block_ids, - ) - - cpu_allocator = PrefixCachingBlockAllocator( - num_blocks=num_cpu_blocks, - block_size=block_size, - block_ids=cpu_block_ids, - ) - else: - raise ValueError(f"Unknown allocator type {allocator_type=}") - - return CpuGpuBlockAllocator( - cpu_block_allocator=cpu_allocator, - gpu_block_allocator=gpu_allocator, - ) - - def __init__(self, cpu_block_allocator: BlockAllocator, - gpu_block_allocator: BlockAllocator): - assert not ( - cpu_block_allocator.all_block_ids - & gpu_block_allocator.all_block_ids - ), "cpu and gpu block allocators can't have intersection of block ids" - - self._allocators = { - Device.CPU: cpu_block_allocator, - Device.GPU: gpu_block_allocator, - } - - self._swap_mapping: Dict[int, int] = {} - self._null_block: Optional[Block] = None - - self._block_ids_to_allocator: Dict[int, BlockAllocator] = {} - for _, allocator in self._allocators.items(): - for block_id in allocator.all_block_ids: - self._block_ids_to_allocator[block_id] = allocator - - def allocate_or_get_null_block(self) -> Block: - if self._null_block is None: - self._null_block = NullBlock( - self.allocate_mutable_block(None, Device.GPU)) - return self._null_block - - def allocate_mutable_block(self, - prev_block: Optional[Block], - device: Device, - extra_hash: Optional[int] = None) -> Block: - """Allocates a new mutable block on the specified device. - - Args: - prev_block (Optional[Block]): The previous block to in the sequence. - Used for prefix hashing. - device (Device): The device on which to allocate the new block. - extra_hash (Optional[int]): The hash value of additional - factors, such as adapters, that influence the block hash - in the prefix caching block. - - Returns: - Block: The newly allocated mutable block. - """ - return self._allocators[device].allocate_mutable_block( - prev_block, extra_hash=extra_hash) - - def allocate_immutable_blocks( - self, - prev_block: Optional[Block], - block_token_ids: List[List[int]], - device: Device, - extra_hash: Optional[int] = None) -> List[Block]: - """Allocates a new group of immutable blocks with the provided block - token IDs on the specified device. 
- - Args: - prev_block (Optional[Block]): The previous block in the sequence. - Used for prefix hashing. - block_token_ids (List[int]): The list of block token IDs to be - stored in the new blocks. - device (Device): The device on which to allocate the new block. - extra_hash (Optional[int]): The hash value of additional - factors, such as adapters, that influence the block hash - in the prefix caching block. - - Returns: - List[Block]: The newly allocated list of immutable blocks - containing the provided block token IDs. - """ - return self._allocators[device].allocate_immutable_blocks( - prev_block, block_token_ids, extra_hash=extra_hash) - - def allocate_immutable_block(self, - prev_block: Optional[Block], - token_ids: List[int], - device: Device, - extra_hash: Optional[int] = None) -> Block: - """Allocates a new immutable block with the provided token IDs on the - specified device. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. - Used for prefix hashing. - token_ids (List[int]): The list of token IDs to be stored in the new - block. - device (Device): The device on which to allocate the new block. - extra_hash (Optional[int]): The hash value of additional - factors, such as adapters, that influence the block hash - in the prefix caching block. - - Returns: - Block: The newly allocated immutable block containing the provided - token IDs. - """ - return self._allocators[device].allocate_immutable_block( - prev_block, token_ids, extra_hash=extra_hash) - - def free(self, block: Block) -> None: - """Frees the memory occupied by the given block. - - Args: - block (Block): The block to be freed. - """ - # Null block should never be freed - if isinstance(block, NullBlock): - return - block_id = block.block_id - assert block_id is not None - allocator = self._block_ids_to_allocator[block_id] - allocator.free(block) - - def fork(self, last_block: Block) -> List[Block]: - """Creates a new sequence of blocks that shares the same underlying - memory as the original sequence. - - Args: - last_block (Block): The last block in the original sequence. - - Returns: - List[Block]: A new list of blocks that shares the same memory as the - original sequence. - """ - # do not attempt to fork the null block - assert not isinstance(last_block, NullBlock) - block_id = last_block.block_id - assert block_id is not None - allocator = self._block_ids_to_allocator[block_id] - return allocator.fork(last_block) - - def get_num_free_blocks(self, device: Device) -> int: - """Returns the number of free blocks available on the specified device. - - Args: - device (Device): The device for which to query the number of free - blocks. AssertionError is raised if None is passed. - - Returns: - int: The number of free blocks available on the specified device. - """ - return self._allocators[device].get_num_free_blocks() - - def get_num_total_blocks(self, device: Device) -> int: - return self._allocators[device].get_num_total_blocks() - - def get_physical_block_id(self, device: Device, absolute_id: int) -> int: - """Returns the zero-offset block id on certain device given the - absolute block id. - - Args: - device (Device): The device for which to query relative block id. - absolute_id (int): The absolute block id for the block in - whole allocator. - - Returns: - int: The zero-offset block id on certain device. 
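get_physical_block_id() translates an absolute id (unique across both devices) into a per-device, zero-based id; the removed allocators do this by position in the sorted id list. A hedged standalone sketch with hypothetical id ranges:

gpu_block_ids = list(range(0, 12))    # absolute ids owned by the GPU allocator
cpu_block_ids = list(range(12, 20))   # absolute ids owned by the CPU allocator

def physical_block_id(device_block_ids, absolute_id):
    # Zero-offset id = rank of the absolute id within this device's ids.
    return sorted(device_block_ids).index(absolute_id)

assert physical_block_id(gpu_block_ids, 3) == 3    # GPU ids already start at 0
assert physical_block_id(cpu_block_ids, 12) == 0   # CPU ids are re-based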
- """ - return self._allocators[device].get_physical_block_id(absolute_id) - - def swap(self, blocks: List[Block], src_device: Device, - dst_device: Device) -> Dict[int, int]: - """Execute the swap for the given blocks from source_device - on to dest_device, save the current swap mapping and append - them to the accumulated `self._swap_mapping` for each - scheduling move. - - Args: - blocks: List of blocks to be swapped. - src_device (Device): Device to swap the 'blocks' from. - dst_device (Device): Device to swap the 'blocks' to. - - Returns: - Dict[int, int]: Swap mapping from source_device - on to dest_device. - """ - src_block_ids = [block.block_id for block in blocks] - self._allocators[src_device].swap_out(blocks) - self._allocators[dst_device].swap_in(blocks) - dst_block_ids = [block.block_id for block in blocks] - - current_swap_mapping: Dict[int, int] = {} - for src_block_id, dst_block_id in zip(src_block_ids, dst_block_ids): - if src_block_id is not None and dst_block_id is not None: - self._swap_mapping[src_block_id] = dst_block_id - current_swap_mapping[src_block_id] = dst_block_id - return current_swap_mapping - - def get_num_full_blocks_touched(self, blocks: List[Block], - device: Device) -> int: - """Returns the number of full blocks that will be touched by - swapping in/out the given blocks on to the 'device'. - - Args: - blocks: List of blocks to be swapped. - device (Device): Device to swap the 'blocks' on. - - Returns: - int: the number of full blocks that will be touched by - swapping in/out the given blocks on to the 'device'. - Non full blocks are ignored when deciding the number - of blocks to touch. - """ - return self._allocators[device].get_num_full_blocks_touched(blocks) - - def clear_copy_on_writes(self) -> List[Tuple[int, int]]: - """Clears the copy-on-write (CoW) state and returns the mapping of - source to destination block IDs. - - Returns: - List[Tuple[int, int]]: A list mapping source block IDs to - destination block IDs. - """ - # CoW only supported on GPU - device = Device.GPU - return self._allocators[device].clear_copy_on_writes() - - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - """Mark blocks as accessed, only use for prefix caching.""" - # Prefix caching only supported on GPU. - device = Device.GPU - return self._allocators[device].mark_blocks_as_accessed(block_ids, now) - - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - """Mark blocks as accessed, only use for prefix caching.""" - # Prefix caching only supported on GPU. - device = Device.GPU - return self._allocators[device].mark_blocks_as_computed(block_ids) - - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - # Prefix caching only supported on GPU. - device = Device.GPU - return self._allocators[device].get_common_computed_block_ids( - computed_seq_block_ids) - - @property - def all_block_ids(self) -> FrozenSet[int]: - return frozenset(self._block_ids_to_allocator.keys()) - - def get_prefix_cache_hit_rate(self, device: Device) -> float: - """Prefix cache hit rate. 
-1 means not supported or disabled.""" - assert device in self._allocators - return self._allocators[device].get_prefix_cache_hit_rate() - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - """Reset prefix cache for specified or all devices.""" - if device: - return self._allocators[device].reset_prefix_cache() - success = True - for allocator in self._allocators.values(): - success = success and allocator.reset_prefix_cache() - return success - - def get_and_reset_swaps(self) -> List[Tuple[int, int]]: - """Returns and clears the mapping of source to destination block IDs. - Will be called after every swapping operations for now, and after every - schedule when BlockManagerV2 become default. Currently not useful. - - Returns: - List[Tuple[int, int]]: A mapping of source to destination block IDs. - """ - mapping = self._swap_mapping.copy() - self._swap_mapping.clear() - return list(mapping.items()) - - def find_cached_blocks_prefix( - self, - block_hashes: List[int], - device: Device = Device.GPU, - ) -> List[int]: - return self._allocators[device].find_cached_blocks_prefix(block_hashes) - - -class NullBlock(Block): - """ - Null blocks are used as a placeholders for KV cache blocks that have - been dropped due to sliding window. - This implementation just wraps an ordinary block and prevents it from - being modified. It also allows for testing if a block is NullBlock - via isinstance(). - """ - - def __init__(self, proxy: Block): - super().__init__() - self._proxy = proxy - - def append_token_ids(self, token_ids: List[BlockId]): - raise ValueError("null block should not be modified") - - @property - def block_id(self): - return self._proxy.block_id - - @block_id.setter - def block_id(self, value: Optional[BlockId]): - raise ValueError("null block should not be modified") - - @property - def token_ids(self) -> List[BlockId]: - return self._proxy.token_ids - - @property - def num_tokens_total(self) -> int: - raise NotImplementedError( - "num_tokens_total is not used for null block") - - @property - def num_empty_slots(self) -> BlockId: - return self._proxy.num_empty_slots - - @property - def is_full(self): - return self._proxy.is_full - - @property - def prev_block(self): - return self._proxy.prev_block - - @property - def extra_hash(self): - return None - - @property - def computed(self): - return self._proxy.computed - - @computed.setter - def computed(self, value): - self._proxy.computed = value - - @property - def last_accessed(self) -> float: - return self._proxy.last_accessed - - @last_accessed.setter - def last_accessed(self, last_accessed_ts: float): - self._proxy.last_accessed = last_accessed_ts - - @property - def content_hash(self): - return self._proxy.content_hash diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py deleted file mode 100644 index 1a05881f7c00..000000000000 --- a/vllm/core/block/interfaces.py +++ /dev/null @@ -1,319 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import ABC, abstractmethod -from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple - -from vllm.utils import Device - -BlockId = int - - -class Block(ABC): - - @abstractmethod - def append_token_ids(self, token_ids: List[int]) -> None: - pass - - @property - @abstractmethod - def block_id(self) -> Optional[int]: - pass - - @block_id.setter - @abstractmethod - def block_id(self, value: Optional[int]) -> None: - """NOTE: Do not use this API outside Block.""" - 
self._block_id = value - - @property - @abstractmethod - def token_ids(self) -> List[int]: - pass - - @property - @abstractmethod - def num_tokens_total(self) -> int: - """The number of tokens till the current block (inclusive) - """ - pass - - @property - @abstractmethod - def num_empty_slots(self) -> int: - pass - - @property - @abstractmethod - def is_full(self) -> bool: - pass - - @property - @abstractmethod - def prev_block(self) -> Optional["Block"]: - pass - - @property - @abstractmethod - def extra_hash(self) -> Optional[int]: - return None - - @property - @abstractmethod - def computed(self) -> bool: - raise NotImplementedError - - @computed.setter - @abstractmethod - def computed(self, value) -> bool: - """Should be only used by PrefixCacingAllocator""" - raise NotImplementedError - - @property - @abstractmethod - def last_accessed(self) -> float: - raise NotImplementedError - - @last_accessed.setter - @abstractmethod - def last_accessed(self, last_accessed_ts: float): - raise NotImplementedError - - class Factory(Protocol): - - @abstractmethod - def __call__( - self, - prev_block: Optional["Block"], - token_ids: List[int], - block_size: int, - allocator: "BlockAllocator", - block_id: Optional[int] = None, - computed: bool = False, - extra_hash: Optional[int] = None, - ) -> "Block": - pass - - @property - @abstractmethod - def content_hash(self) -> Optional[int]: - """Return the content-based hash of the current block, or None if it is - not yet defined or not supported. - - For the content-based hash to be defined, the current block must be - full. - """ - return None - - -class BlockAllocator(ABC): - - @abstractmethod - def allocate_mutable_block(self, prev_block: Optional[Block], - extra_hash: Optional[int]) -> Block: - pass - - @abstractmethod - def allocate_immutable_block(self, prev_block: Optional[Block], - token_ids: List[int], - extra_hash: Optional[int]) -> Block: - pass - - @abstractmethod - def allocate_immutable_blocks(self, prev_block: Optional[Block], - block_token_ids: List[List[int]], - extra_hash: Optional[int]) -> List[Block]: - pass - - @abstractmethod - def free(self, block: Block) -> None: - pass - - @abstractmethod - def fork(self, last_block: Block) -> List[Block]: - pass - - @abstractmethod - def get_num_total_blocks(self) -> int: - pass - - @abstractmethod - def get_num_free_blocks(self) -> int: - pass - - @abstractmethod - def get_physical_block_id(self, absolute_id: int) -> int: - pass - - @abstractmethod - def swap_out(self, blocks: List[Block]) -> None: - pass - - @abstractmethod - def swap_in(self, blocks: List[Block]) -> None: - pass - - @property - @abstractmethod - def all_block_ids(self) -> FrozenSet[int]: - pass - - @abstractmethod - def clear_copy_on_writes(self) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - pass - - @abstractmethod - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - pass - - @abstractmethod - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - pass - - @abstractmethod - def cow_block_if_not_appendable(self, block: Block) -> BlockId: - """NOTE: This should not be used besides Block""" - pass - - @abstractmethod - def promote_to_immutable_block(self, block: Block) -> BlockId: - """NOTE: This should not be used besides Block""" - pass - - @abstractmethod - def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: - pass - - @abstractmethod - def 
get_prefix_cache_hit_rate(self) -> float: - """Prefix cache hit rate. -1 means not supported or disabled.""" - pass - - @abstractmethod - def reset_prefix_cache(self) -> bool: - """Reset prefix cache.""" - pass - - class NoFreeBlocksError(ValueError): - pass - - @abstractmethod - def find_cached_blocks_prefix( - self, - block_hashes: List[int], - ) -> List[int]: - pass - - -class DeviceAwareBlockAllocator(ABC): - - @abstractmethod - def allocate_mutable_block(self, - prev_block: Optional[Block], - device: Device, - extra_hash: Optional[int] = None) -> Block: - pass - - @abstractmethod - def allocate_immutable_block(self, - prev_block: Optional[Block], - token_ids: List[int], - device: Device, - extra_hash: Optional[int] = None) -> Block: - pass - - @abstractmethod - def allocate_immutable_blocks( - self, - prev_block: Optional[Block], - block_token_ids: List[List[int]], - device: Device, - extra_hash: Optional[int] = None, - ) -> List[Block]: - pass - - @abstractmethod - def get_num_free_blocks(self, device: Device) -> int: - pass - - @abstractmethod - def get_num_total_blocks(self, device: Device) -> int: - pass - - @abstractmethod - def free(self, block: Block) -> None: - pass - - @abstractmethod - def fork(self, last_block: Block) -> List[Block]: - pass - - @property - @abstractmethod - def all_block_ids(self) -> FrozenSet[int]: - pass - - @abstractmethod - def clear_copy_on_writes(self) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - pass - - @abstractmethod - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - pass - - @abstractmethod - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - pass - - @abstractmethod - def get_num_full_blocks_touched(self, blocks: List[Block], - device: Device) -> int: - pass - - @abstractmethod - def swap(self, blocks: List[Block], src_device: Device, - dst_device: Device) -> Dict[int, int]: - pass - - @abstractmethod - def get_physical_block_id(self, device: Device, absolute_id: int) -> int: - pass - - @abstractmethod - def allocate_or_get_null_block(self) -> Block: - """ - Null blocks are used as a placeholders for KV cache blocks that have - been dropped due to sliding window. - There is at most one null block per allocator. - """ - pass - - @abstractmethod - def get_prefix_cache_hit_rate(self, device: Device) -> float: - """Prefix cache hit rate. 
-1 means not supported or disabled.""" - pass - - @abstractmethod - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - """Reset prefix cache.""" - pass - - @abstractmethod - def find_cached_blocks_prefix( - self, - block_hashes: List[int], - device: Device = Device.GPU, - ) -> List[int]: - pass diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py deleted file mode 100644 index dae6ead04e9c..000000000000 --- a/vllm/core/block/naive_block.py +++ /dev/null @@ -1,466 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections import deque -from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union - -from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter, - get_all_blocks_recursively) -from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device - -Refcount = int - - -class NaiveBlockAllocator(BlockAllocator): - """A simple block allocator that manages blocks of memory without prefix - caching. - - Args: - create_block (Block.Factory): A factory function for creating new - blocks. This is used when a NaiveBlockAllocator is composed within - a prefix caching allocator -- the naive block allocator must - construct prefix caching blocks (but shouldn't know anything else - about them). - num_blocks (int): The total number of blocks to manage. - block_size (int): The size of each block in tokens. - block_ids (Optional[Iterable[int]], optional): An optional iterable of - block IDs. If not provided, block IDs will be assigned sequentially - from 0 to num_blocks - 1. - """ - - def __init__( - self, - create_block: Block.Factory, - num_blocks: int, - block_size: int, - block_ids: Optional[Iterable[int]] = None, - block_pool: Optional[BlockPool] = None, - ): - if block_ids is None: - block_ids = range(num_blocks) - - self._free_block_indices: Deque[BlockId] = deque(block_ids) - self._all_block_indices = frozenset(block_ids) - assert len(self._all_block_indices) == num_blocks - - self._refcounter = RefCounter( - all_block_indices=self._free_block_indices) - self._block_size = block_size - - self._cow_tracker = CopyOnWriteTracker( - refcounter=self._refcounter.as_readonly()) - - if block_pool is None: - extra_factor = 4 - # Pre-allocate "num_blocks * extra_factor" block objects. - # The "* extra_factor" is a buffer to allow more block objects - # than physical blocks - self._block_pool = BlockPool(self._block_size, create_block, self, - num_blocks * extra_factor) - else: - # In this case, the block pool is provided by the caller, - # which means that there is most likely a need to share - # a block pool between allocators - self._block_pool = block_pool - - def allocate_immutable_block(self, - prev_block: Optional[Block], - token_ids: List[int], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> Block: - """Allocates a new immutable block with the given token IDs, linked to - the previous block. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. If - None, then the block to be allocated is the first block in the - sequence. - token_ids (List[int]): The token IDs to be stored in the new block. - - Returns: - Block: The newly allocated immutable block. 
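The removed NaiveBlockAllocator keeps a deque of free physical ids plus a shared refcounter: allocation pops an id from the front, and an id returns to the deque only when its refcount drops back to zero (which is also what lets fork() share ids across sequences). A minimal sketch of that bookkeeping, with made-up sizes and a plain RuntimeError standing in for NoFreeBlocksError:

from collections import deque

free_ids = deque(range(4))
refcount = {i: 0 for i in range(4)}

def allocate():
    if not free_ids:
        raise RuntimeError("no free blocks")
    block_id = free_ids.popleft()
    refcount[block_id] += 1
    return block_id

def release(block_id):
    refcount[block_id] -= 1
    if refcount[block_id] == 0:
        free_ids.appendleft(block_id)

b = allocate()
release(b)
assert b in free_ids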
- """ - assert device is None - block = self.allocate_mutable_block(prev_block=prev_block) - block.append_token_ids(token_ids) - return block - - def allocate_immutable_blocks( - self, - prev_block: Optional[Block], - block_token_ids: List[List[int]], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> List[Block]: - assert device is None - num_blocks = len(block_token_ids) - - block_ids = [] - for i in range(num_blocks): - block_ids.append(self._allocate_block_id()) - - blocks = [] - for i in range(num_blocks): - prev_block = self._block_pool.init_block( - prev_block=prev_block, - token_ids=block_token_ids[i], - block_size=self._block_size, - physical_block_id=block_ids[i]) - blocks.append(prev_block) - - return blocks - - def allocate_mutable_block(self, - prev_block: Optional[Block], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> Block: - """Allocates a new mutable block, linked to the previous block. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. If - None, then the block to be allocated is the first block in the - sequence. - - Returns: - Block: The newly allocated mutable block. - """ - assert device is None - block_id = self._allocate_block_id() - block = self._block_pool.init_block(prev_block=prev_block, - token_ids=[], - block_size=self._block_size, - physical_block_id=block_id) - return block - - def _allocate_block_id(self) -> BlockId: - if not self._free_block_indices: - raise BlockAllocator.NoFreeBlocksError() - - block_id = self._free_block_indices.popleft() - self._refcounter.incr(block_id) - return block_id - - def _free_block_id(self, block: Union[Block, BlockId]) -> None: - if isinstance(block, Block): - block_id = block.block_id - block.block_id = None - else: - block_id = block - assert block_id is not None - - refcount = self._refcounter.decr(block_id) - if refcount == 0: - self._free_block_indices.appendleft(block_id) - - def free(self, block: Block, keep_block_object: bool = False) -> None: - # Release the physical block id - self._free_block_id(block) - - # Release the block object - if not keep_block_object: - self._block_pool.free_block(block) - - def free_block_id(self, block_id: BlockId) -> None: - self._free_block_id(block_id) - - def fork(self, last_block: Block) -> List[Block]: - """Creates a new sequence of blocks that shares the same underlying - memory as the original sequence. - - Args: - last_block (Block): The last block in the original sequence. - - Returns: - List[Block]: The new sequence of blocks that shares the same memory - as the original sequence. - """ - source_blocks = get_all_blocks_recursively(last_block) - - forked_blocks: List[Block] = [] - prev_block = None - for block in source_blocks: - - # Increment refcount for each block. - assert block.block_id is not None - refcount = self._refcounter.incr(block.block_id) - assert refcount != 1, "can't fork free'd block" - - forked_block = self._block_pool.init_block( - prev_block=prev_block, - token_ids=block.token_ids, - block_size=self._block_size, - physical_block_id=block.block_id) - - forked_blocks.append(forked_block) - prev_block = forked_blocks[-1] - - return forked_blocks - - def get_num_free_blocks(self) -> int: - return len(self._free_block_indices) - - def get_num_total_blocks(self) -> int: - return len(self._all_block_indices) - - def get_physical_block_id(self, absolute_id: int) -> int: - """Returns the zero-offset block id on certain block allocator - given the absolute block id. 
- - Args: - absolute_id (int): The absolute block id for the block - in whole allocator. - - Returns: - int: The zero-offset block id on certain device. - """ - return sorted(self._all_block_indices).index(absolute_id) - - @property - def refcounter(self): - return self._refcounter - - @property - def all_block_ids(self) -> FrozenSet[int]: - return self._all_block_indices - - def cow_block_if_not_appendable(self, block: Block) -> BlockId: - """Performs a copy-on-write operation on the given block if it is not - appendable. - - Args: - block (Block): The block to check for copy-on-write. - - Returns: - BlockId: The block index of the new block if a copy-on-write - operation was performed, or the original block index if - no copy-on-write was necessary. - """ - src_block_id = block.block_id - assert src_block_id is not None - - if self._cow_tracker.is_appendable(block): - return src_block_id - - self._free_block_id(block) - trg_block_id = self._allocate_block_id() - - self._cow_tracker.record_cow(src_block_id, trg_block_id) - - return trg_block_id - - def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]: - """Returns the copy-on-write source->destination mapping and clears it. - - Returns: - List[Tuple[BlockId, BlockId]]: A list mapping source - block indices to destination block indices. - """ - return self._cow_tracker.clear_cows() - - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - """Mark blocks as accessed, used in prefix caching. - - Since the naive allocator does not implement prefix caching, we do - nothing. - """ - pass - - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - """Mark blocks as computed, used in prefix caching. - - Since the naive allocator does not implement prefix caching, we do - nothing. - """ - pass - - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - """Determine blocks that can be skipped in prefill. - - Since the naive allocator does not support prefix caching, always return - an empty list. - """ - return [] - - def promote_to_immutable_block(self, block: Block) -> BlockId: - raise NotImplementedError("There is no promotion for naive blocks") - - def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: - """Returns the number of full blocks that will be touched by - swapping in/out. - - Args: - blocks: List of blocks to be swapped. - Returns: - int: the number of full blocks that will be touched by - swapping in/out the given blocks. Non full blocks are ignored - when deciding the number of blocks to touch. - """ - # NOTE: for naive block, we use set to eliminate common blocks among - # seqs, also we compare the empty slots in the mutable blocks with - # lookahead slots to get the number of unique new block that are - # needed. - old_block_set = set() - for block in blocks: - if block.is_full: - old_block_set.add(block) - return len(old_block_set) - - def swap_out(self, blocks: List[Block]) -> None: - for block in blocks: - self._free_block_id(block) - - def swap_in(self, blocks: List[Block]) -> None: - for block in blocks: - # Here we allocate either immutable or mutable block and then - # extract its block_id. 
Note that the block object is released - # and the block_id is assigned to "block" to allow reusing the - # existing "block" object - if block.is_full: - tmp_block = self.allocate_immutable_block( - prev_block=block.prev_block, token_ids=block.token_ids) - else: - tmp_block = self.allocate_mutable_block( - prev_block=block.prev_block) - tmp_block.append_token_ids(block.token_ids) - - block_id = tmp_block.block_id - tmp_block.block_id = None - self._block_pool.free_block(tmp_block) - - block.block_id = block_id # Assign block_id - - def get_prefix_cache_hit_rate(self) -> float: - return -1 - - def reset_prefix_cache(self) -> bool: - """No prefix cache for naive block allocator.""" - return True - - def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: - # Not applicable for naive block allocator. - return [] - - -class NaiveBlock(Block): - """An implementation of the Block class that does not support prefix - caching. - - The NaiveBlock class represents a block of token IDs with a fixed size. It - provides methods for appending token IDs to the block and manages copy-on - -write operations when necessary. - - Args: - prev_block (Block): The previous block in the sequence. - token_ids (List[int]): The initial token IDs to be stored in the block. - block_size (int): The maximum number of token IDs that can be stored in - the block. - allocator (BlockAllocator): The block allocator associated with this - block. - block_id (Optional[int], optional): The physical block index - of this block. Defaults to None, which means no allocation has been - made. - _cow_target (Optional[Block], optional): The copy-on-write target block. - If not provided, it defaults to self. - """ - - def __init__(self, - prev_block: Optional[Block], - token_ids: List[int], - block_size: int, - allocator: BlockAllocator, - block_id: Optional[int] = None, - _cow_target: Optional[Block] = None, - extra_hash: Optional[int] = None): - self._token_ids: List[int] = [] - self._block_size = block_size - self._prev_block = prev_block - self._block_id = block_id - self._allocator = allocator - self._cow_target = _cow_target if _cow_target is not None else self - - self._append_token_ids_no_cow(token_ids) - - def append_token_ids(self, token_ids: List[int]) -> None: - """Appends the given token IDs to the block and performs a - copy-on-write if necessary. - - Args: - token_ids (Optional[List[int]]): The token IDs to be appended - to the block. - """ - self._append_token_ids_no_cow(token_ids) - - if self._block_id is not None: - self._block_id = (self._allocator.cow_block_if_not_appendable( - self._cow_target)) - - def _append_token_ids_no_cow(self, token_ids: List[int]) -> None: - """Appends the given token IDs to the block - - Args: - token_ids (List[int]): The token IDs to be appended to the block. 
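append_token_ids() defers to cow_block_if_not_appendable(): a block whose physical id is shared (refcount above one) cannot be written in place, so the write is redirected to a fresh block and the (src, dst) pair is queued for clear_copy_on_writes(). An illustrative sketch of that decision, with hypothetical ids:

refcount = {7: 2}                      # block 7 is shared by two sequences
cow_pairs = []                         # (src, dst) pairs handed to the scheduler

def maybe_cow(block_id, allocate_new=lambda: 8):
    if refcount[block_id] == 1:
        return block_id                # exclusive owner: append in place
    new_id = allocate_new()            # shared: redirect writes to a new block
    cow_pairs.append((block_id, new_id))
    return new_id

assert maybe_cow(7) == 8 and cow_pairs == [(7, 8)]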
- """ - if len(token_ids) == 0: - return - - assert len(token_ids) <= self.num_empty_slots - - self._token_ids.extend(token_ids) - - @property - def computed(self) -> bool: - raise NotImplementedError - - @computed.setter - def computed(self, value) -> None: - raise NotImplementedError - - @property - def last_accessed(self) -> float: - raise NotImplementedError - - @last_accessed.setter - def last_accessed(self, last_accessed_ts: float): - raise NotImplementedError - - @property - def block_id(self) -> Optional[int]: - return self._block_id - - @block_id.setter - def block_id(self, value: Optional[int]) -> None: - self._block_id = value - - @property - def is_full(self) -> bool: - return self.num_empty_slots == 0 - - @property - def num_empty_slots(self) -> int: - return self._block_size - len(self.token_ids) - - @property - def token_ids(self) -> List[int]: - return self._token_ids - - @property - def num_tokens_total(self) -> int: - raise NotImplementedError( - "num_tokens_total is not used for naive block") - - @property - def block_size(self) -> int: - return self._block_size - - @property - def prev_block(self) -> Optional["Block"]: - return self._prev_block - - @property - def extra_hash(self): - return None - - @property - def content_hash(self) -> Optional[int]: - return None diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py deleted file mode 100644 index 2913a01bf34a..000000000000 --- a/vllm/core/block/prefix_caching_block.py +++ /dev/null @@ -1,1135 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Token blocks.""" -import sys -from bisect import bisect_left -from os.path import commonprefix -from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set, - Tuple) - -from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker, - get_all_blocks_recursively) -from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, Device, - DeviceAwareBlockAllocator) -from vllm.core.block.naive_block import (BlockPool, NaiveBlock, - NaiveBlockAllocator) -from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor -from vllm.logger import init_logger -from vllm.sequence import Sequence - -PrefixHash = int - -# By default, we init our block access time as _DEFAULT_LAST_ACCESSED_TIME -# so that if we find one block is still hold _DEFAULT_LAST_ACCESSED_TIME, -# then we know this block hasn't been accessed yet. -_DEFAULT_LAST_ACCESSED_TIME = -1 - -logger = init_logger(__name__) - - -class BlockTracker: - """Used to track the status of a block inside the prefix caching allocator - """ - __slots__ = ("active", "last_accessed", "computed") - - def reset(self): - self.last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME - self.computed: bool = False - - def __init__(self): - self.active: bool = False - self.reset() - - def enable(self): - assert not self.active - self.active = True - self.reset() - - def disable(self): - assert self.active - self.active = False - self.reset() - - -class PrefixCachingBlockAllocator(BlockAllocator): - """A block allocator that implements prefix caching. - - The PrefixCachingBlockAllocator maintains a cache of blocks based on their - content hash. It reuses blocks with the same content hash to avoid redundant - memory allocation. The allocator also supports copy-on-write operations. - - Args: - num_blocks (int): The total number of blocks to manage. - block_size (int): The size of each block in tokens. 
- block_ids(Optional[Iterable[int]], optional): An optional iterable of - block IDs. If not provided, block IDs will be assigned sequentially - from 0 to num_blocks - 1. - """ - - # Note that we use 'None' as a string here instead of None because - # as of Python 3.12, hash(None) returns a constant predictable value. - # This could possibly make it easier to find and exploit hash - # collisions. 'None' as a string will be hashed differently per process, - # but consistently within the same process. This is the same as the - # behavior of None prior to Python 3.12. - _none_hash: int = hash('None') - - # Implements Block.Factory. - def __init__( - self, - num_blocks: int, - block_size: int, - block_ids: Optional[Iterable[int]] = None, - eviction_policy: EvictionPolicy = EvictionPolicy.LRU, - ): - if block_ids is None: - block_ids = range(num_blocks) - - self._block_size = block_size - - # A mapping of prefix hash to block index. All blocks which have a - # prefix hash will be in this dict, even if they have refcount 0. - self._cached_blocks: Dict[PrefixHash, BlockId] = {} - - # A list of immutable block IDs that have been touched by scheduler - # and should be marked as computed after an entire batch of sequences - # are scheduled. - self._touched_blocks: Set[BlockId] = set() - - # Used to track status of each physical block id - self._block_tracker: Dict[BlockId, BlockTracker] = {} - for block_id in block_ids: - self._block_tracker[block_id] = BlockTracker() - - # Pre-allocate "num_blocks * extra_factor" block objects. - # The "* extra_factor" is a buffer to allow more block objects - # than physical blocks - extra_factor = 4 - self._block_pool = BlockPool(self._block_size, self._create_block, - self, num_blocks * extra_factor) - - # An allocator for blocks that do not have prefix hashes. - self._hashless_allocator = NaiveBlockAllocator( - create_block=self._create_block, # type: ignore - num_blocks=num_blocks, - block_size=block_size, - block_ids=block_ids, - block_pool=self._block_pool, # Share block pool here - ) - - # Evitor used to maintain how we want to handle those computed blocks - # if we find memory pressure is high. - self.eviction_policy = eviction_policy - self.evictor: Evictor = make_evictor(self.eviction_policy) - - # We share the refcounter between allocators. This allows us to promote - # blocks originally allocated in the hashless allocator to immutable - # blocks. - self._refcounter = self._hashless_allocator.refcounter - - self._cow_tracker = CopyOnWriteTracker( - refcounter=self._refcounter.as_readonly()) - - self.metric_data = CacheMetricData() - - def _create_block( - self, - prev_block: Optional[Block], - token_ids: List[int], - block_size: int, - allocator: BlockAllocator, - block_id: Optional[int] = None, - computed: bool = False, - extra_hash: Optional[int] = None, - ) -> Block: - # Bind block to self. - allocator = self - - return PrefixCachingBlock( - prev_block=prev_block, - token_ids=token_ids, - block_size=block_size, - block_id=block_id, - allocator=allocator, - computed=computed, - extra_hash=extra_hash, - ) - - def allocate_immutable_block(self, - prev_block: Optional[Block], - token_ids: List[int], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> Block: - """Allocates an immutable block with the given token IDs, reusing cached - blocks if possible. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. - token_ids (List[int]): The token IDs to be stored in the block. 
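allocate_immutable_block() hashes the full block's contents and consults the allocator's hash-to-block-id map: a hit reuses the cached physical block (bumping its refcount), a miss allocates and registers a new one, and CacheMetricData records the outcome. A stripped-down sketch of that lookup with illustrative names only:

class PrefixCacheSketch:
    def __init__(self):
        self.cached_blocks = {}        # content_hash -> block_id
        self.next_free_id = 0
        self.hits = 0
        self.misses = 0

    def get_or_allocate(self, content_hash):
        if content_hash in self.cached_blocks:
            self.hits += 1
            return self.cached_blocks[content_hash]
        self.misses += 1
        block_id = self.next_free_id
        self.next_free_id += 1
        self.cached_blocks[content_hash] = block_id
        return block_id

cache = PrefixCacheSketch()
h = hash((True, 0, 1, 2, 3))           # a block-content hash
assert cache.get_or_allocate(h) == cache.get_or_allocate(h)  # second call hits
assert (cache.hits, cache.misses) == (1, 1)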
- - Returns: - Block: The allocated immutable block. - """ - assert device is None - assert_prefix_caching_block_or_none(prev_block) - - # First, try to create a block that points to cached data - block = self._block_pool.init_block(prev_block=prev_block, - token_ids=token_ids, - block_size=self._block_size, - physical_block_id=None, - extra_hash=extra_hash) - assert block.content_hash is not None - - cached_block_id = self._cached_blocks.get(block.content_hash, None) - if cached_block_id is not None: - self.metric_data.query(hit=True) - block.block_id = cached_block_id - self._incr_refcount_cached_block(block) - return block - self.metric_data.query(hit=False) - self._block_pool.free_block(block) - - # No cached block => Allocate a new block - block = self.allocate_mutable_block(prev_block, extra_hash=extra_hash) - block.append_token_ids(token_ids) - return block - - def allocate_immutable_blocks( - self, - prev_block: Optional[Block], - block_token_ids: List[List[int]], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> List[Block]: - blocks = [] - for token_ids in block_token_ids: - prev_block = self.allocate_immutable_block(prev_block=prev_block, - token_ids=token_ids, - device=device, - extra_hash=extra_hash) - blocks.append(prev_block) - return blocks - - def allocate_mutable_block(self, - prev_block: Optional[Block], - extra_hash: Optional[int] = None, - device: Optional[Device] = None) -> Block: - """Allocates a mutable block. If there are no free blocks, this will - evict unused cached blocks. - - Args: - prev_block (Block): The previous block in the sequence. - None is not allowed unlike it is super class. - - Returns: - Block: The allocated mutable block. - """ - assert device is None - assert_prefix_caching_block_or_none(prev_block) - - block_id = self._allocate_block_id() - block = self._block_pool.init_block(prev_block=prev_block, - token_ids=[], - block_size=self._block_size, - physical_block_id=block_id, - extra_hash=extra_hash) - assert not block.computed - assert block.content_hash is None - return block - - def _incr_refcount_cached_block(self, block: Block) -> None: - # Set this block to be "computed" since it is pointing to a - # cached block id (which was already computed) - block.computed = True - - block_id = block.block_id - assert block_id is not None - - refcount = self._refcounter.incr(block_id) - if refcount == 1: - # In case a cached block was evicted, restore its tracking - if block_id in self.evictor: - self.evictor.remove(block_id) - - self._track_block_id(block_id, computed=True) - - def _decr_refcount_cached_block(self, block: Block) -> None: - # Ensure this is immutable/cached block - assert block.content_hash is not None - - block_id = block.block_id - assert block_id is not None - - refcount = self._refcounter.decr(block_id) - if refcount > 0: - block.block_id = None - return - else: - assert refcount == 0 - - # No longer used - assert block.content_hash in self._cached_blocks - - # Add the cached block to the evictor - # (This keeps the cached block around so it can be reused) - self.evictor.add(block_id, block.content_hash, block.num_tokens_total, - self._block_tracker[block_id].last_accessed) - - # Stop tracking the block - self._untrack_block_id(block_id) - - block.block_id = None - - def _decr_refcount_hashless_block(self, block: Block) -> None: - block_id = block.block_id - assert block_id is not None - - # We may have a fork case where block is shared, - # in which case, we cannot remove it from tracking - refcount = 
self._refcounter.get(block_id) - if refcount == 1: - self._untrack_block_id(block_id) - - # Decrement refcount of the block_id, but do not free the block object - # itself (will be handled by the caller) - self._hashless_allocator.free(block, keep_block_object=True) - - def _allocate_block_id(self) -> BlockId: - """First tries to allocate a block id from the hashless allocator, - and if there are no blocks, then tries to evict an unused cached block. - """ - hashless_block_id = self._maybe_allocate_hashless_block_id() - if hashless_block_id is not None: - return hashless_block_id - - evicted_block_id = self._maybe_allocate_evicted_block_id() - if evicted_block_id is not None: - return evicted_block_id - - # No block available in hashless allocator, nor in unused cache blocks. - raise BlockAllocator.NoFreeBlocksError() - - def _maybe_allocate_hashless_block_id(self) -> Optional[BlockId]: - try: - # Allocate mutable block and extract its block_id - block = self._hashless_allocator.allocate_mutable_block( - prev_block=None) - block_id = block.block_id - self._block_pool.free_block(block) - - self._track_block_id(block_id, computed=False) - return block_id - except BlockAllocator.NoFreeBlocksError: - return None - - def _maybe_allocate_evicted_block_id(self) -> Optional[BlockId]: - if self.evictor.num_blocks == 0: - return None - - # Here we get an evicted block, which is only added - # into evictor if its ref counter is 0 - # and since its content would be changed, we need - # to remove it from _cached_blocks's tracking list - block_id, content_hash_to_evict = self.evictor.evict() - - # Sanity checks - assert content_hash_to_evict in self._cached_blocks - _block_id = self._cached_blocks[content_hash_to_evict] - assert self._refcounter.get(_block_id) == 0 - assert _block_id == block_id - - self._cached_blocks.pop(content_hash_to_evict) - - self._refcounter.incr(block_id) - self._track_block_id(block_id, computed=False) - - return block_id - - def _free_block_id(self, block: Block) -> None: - """Decrements the refcount of the block. The block may be in two - possible states: (1) immutable/cached or (2) mutable/hashless. - In the first case, the refcount is decremented directly and the block - may be possibly added to the evictor. In other case, hashless - allocator free(..) with keep_block_object=True is called to only free - the block id (since the block object may be reused by the caller) - """ - block_id = block.block_id - assert block_id is not None, "Freeing unallocated block is undefined" - - if block.content_hash is not None: - # Immutable: This type of block is always cached, and we want to - # keep it in the evictor for future reuse - self._decr_refcount_cached_block(block) - else: - # Mutable: This type of block is not cached, so we release it - # directly to the hashless allocator - self._decr_refcount_hashless_block(block) - - assert block.block_id is None - - def free(self, block: Block, keep_block_object: bool = False) -> None: - """Release the block (look at free_block_id(..) docs) - """ - # Release the physical block index - self._free_block_id(block) - - # Release the block object to the pool - if not keep_block_object: - self._block_pool.free_block(block) - - def fork(self, last_block: Block) -> List[Block]: - """Creates a new sequence of blocks that shares the same underlying - memory as the original sequence. - - Args: - last_block (Block): The last block in the original sequence. 
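When a cached block's refcount reaches zero it is parked in the evictor rather than freed, so it can either be revived by a later cache hit or reclaimed by _maybe_allocate_evicted_block_id() under memory pressure, which also drops its entry from the hash map. A simplified sketch of that lifecycle, using a FIFO ordered dict as a stand-in for the real LRU evictor:

from collections import OrderedDict

evictor = OrderedDict()                # block_id -> content_hash, oldest first
cached_blocks = {11: 3}                # content_hash -> block_id

def release_cached(block_id, content_hash):
    evictor[block_id] = content_hash   # refcount hit zero: keep for reuse

def evict_one():
    block_id, content_hash = evictor.popitem(last=False)  # reclaim oldest
    del cached_blocks[content_hash]    # its contents are about to change
    return block_id

release_cached(3, 11)
assert evict_one() == 3 and not cached_blocks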
- - Returns: - List[Block]: The new sequence of blocks that shares the same memory - as the original sequence. - """ - source_blocks = get_all_blocks_recursively(last_block) - - forked_blocks: List[Block] = [] - prev_block = None - for block in source_blocks: - block_id = block.block_id - assert block_id is not None - - refcount = self._refcounter.incr(block_id) - assert refcount != 1, "can't fork free'd block_id = {}".format( - block_id) - - forked_block = self._block_pool.init_block( - prev_block=prev_block, - token_ids=block.token_ids, - block_size=self._block_size, - physical_block_id=block_id, - extra_hash=block.extra_hash) - - forked_blocks.append(forked_block) - prev_block = forked_blocks[-1] - - return forked_blocks - - def get_num_free_blocks(self, device: Optional[Device] = None) -> int: - assert device is None - # The number of free blocks is the number of hashless free blocks - # plus the number of blocks evictor could free from its list. - return self._hashless_allocator.get_num_free_blocks( - ) + self.evictor.num_blocks - - def get_num_total_blocks(self) -> int: - return self._hashless_allocator.get_num_total_blocks() - - def get_physical_block_id(self, absolute_id: int) -> int: - """Returns the zero-offset block id on certain block allocator - given the absolute block id. - - Args: - absolute_id (int): The absolute block id for the block - in whole allocator. - - Returns: - int: The rzero-offset block id on certain device. - """ - return sorted(self.all_block_ids).index(absolute_id) - - @property - def all_block_ids(self) -> FrozenSet[int]: - return self._hashless_allocator.all_block_ids - - def get_prefix_cache_hit_rate(self) -> float: - return self.metric_data.get_hit_rate() - - def reset_prefix_cache(self) -> bool: - """Reset prefix cache. This function may be used in RLHF - flows to invalid prefix caching after the weights are updated, - or used for resetting prefix caching status for benchmarking. - - Returns: - bool: True if the prefix cache is successfully reset, - False otherwise. - """ - num_used_blocks = (self.get_num_total_blocks() - - self.get_num_free_blocks()) - if num_used_blocks > 0: - logger.warning( - "Failed to reset prefix cache because some " - "blocks (%d) are not freed yet", num_used_blocks) - return False - - # Free all blocks in the evictor. - while (block_id := - self._maybe_allocate_evicted_block_id()) is not None: - self._hashless_allocator.free_block_id(block_id) - - # Should not have any cached blocks because all blocks are evicted. - assert not self._cached_blocks - - # Reset the evictor. - self.evictor = make_evictor(self.eviction_policy) - - # Reset the block tracker. - for block_id in self._block_tracker: - self._block_tracker[block_id] = BlockTracker() - - # Reset the metrics. - self.metric_data = CacheMetricData() - - logger.info("Successfully reset prefix cache") - return True - - def is_block_cached(self, block: Block) -> bool: - assert block.content_hash is not None - return block.content_hash in self._cached_blocks - - def promote_to_immutable_block(self, block: Block) -> BlockId: - """Once a mutable block is full, it can be promoted to an immutable - block. This means that its content can be referenced by future blocks - having the same prefix. - - Note that if we already have a cached block with the same content, we - will replace the newly-promoted block's mapping with the existing cached - block id. - - Args: - block: The mutable block to be promoted. 
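promote_to_immutable_block() handles the moment a mutable block becomes full: if its content hash is new it becomes the cached copy, otherwise its own physical id is released and the block is re-pointed at the already-cached id. A hedged sketch of that rule, with hypothetical ids:

cached_blocks = {}                     # content_hash -> block_id
released = []                          # ids given back to the hashless pool

def promote(block_id, content_hash):
    if content_hash not in cached_blocks:
        cached_blocks[content_hash] = block_id
        return block_id
    released.append(block_id)          # duplicate content: drop our copy
    return cached_blocks[content_hash]

assert promote(4, 99) == 4             # first block with this content
assert promote(5, 99) == 4             # second block reuses block 4
assert released == [5]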
- - Returns: - BlockId: Either the original block index, or the block index of - the previously cached block matching the same content. - """ - # Ensure block can be promoted - assert block.content_hash is not None - assert block.block_id is not None - assert self._refcounter.get(block.block_id) > 0 - - if block.content_hash not in self._cached_blocks: - # No cached content hash => Set this block as cached. - # Note that this block cannot be marked as computed yet - # because other sequences in the same batch cannot reuse - # this block. - self._cached_blocks[block.content_hash] = block.block_id - # Mark this block as touched so that it can be marked as - # computed after the entire batch of sequences are scheduled. - self._touched_blocks.add(block.block_id) - return block.block_id - - # Reuse the cached content hash - self._decr_refcount_hashless_block(block) - block.block_id = self._cached_blocks[block.content_hash] - - # Increment refcount of the cached block and (possibly) restore - # it from the evictor. - # Note that in this case, the block is marked as computed - self._incr_refcount_cached_block(block) - - return block.block_id - - def cow_block_if_not_appendable(self, block: Block) -> BlockId: - """Performs a copy-on-write operation on the given block if it is not - appendable. - - Args: - block (Block): The block to check for copy-on-write. - - Returns: - BlockId: The block index of the new block if a copy-on-write - operation was performed, or the original block index if - no copy-on-write was necessary. - """ - src_block_id = block.block_id - assert src_block_id is not None - - if self._cow_tracker.is_appendable(block): - return src_block_id - - self._free_block_id(block) - trg_block_id = self._allocate_block_id() - - self._cow_tracker.record_cow(src_block_id, trg_block_id) - - return trg_block_id - - def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]: - """Returns the copy-on-write source->destination mapping and clears it. - - Returns: - List[Tuple[BlockId, BlockId]]: A list mapping source - block indices to destination block indices. - """ - return self._cow_tracker.clear_cows() - - def mark_blocks_as_accessed(self, block_ids: List[int], - now: float) -> None: - """Mark blocks as accessed, used in prefix caching. - - If the block is added into evictor, we need to update corresponding - info in evictor's metadata. - """ - - for block_id in block_ids: - if self._block_tracker[block_id].active: - self._block_tracker[block_id].last_accessed = now - elif block_id in self.evictor: - self.evictor.update(block_id, now) - else: - raise ValueError( - "Mark block as accessed which is not belonged to GPU") - - def mark_blocks_as_computed(self, block_ids: List[int]) -> None: - # Mark all touched blocks as computed. 
- for block_id in self._touched_blocks: - self._block_tracker[block_id].computed = True - self._touched_blocks.clear() - - def _track_block_id(self, block_id: Optional[BlockId], - computed: bool) -> None: - assert block_id is not None - self._block_tracker[block_id].enable() - self._block_tracker[block_id].computed = computed - - def _untrack_block_id(self, block_id: Optional[BlockId]) -> None: - assert block_id is not None - self._block_tracker[block_id].disable() - - def block_is_computed(self, block_id: int) -> bool: - if self._block_tracker[block_id].active: - return self._block_tracker[block_id].computed - else: - return block_id in self.evictor - - def get_common_computed_block_ids( - self, computed_seq_block_ids: List[List[int]]) -> List[int]: - """Return the block ids that are common for a given sequence group. - - Only those blocks that are immutable and already be marked - compyted would be taken consideration. - """ - - # NOTE We exclude the last block to avoid the case where the entire - # prompt is cached. This would cause erroneous behavior in model - # runner. - - # It returns a list of int although type annotation says list of string. - if len(computed_seq_block_ids) == 1: - return computed_seq_block_ids[0] - - return commonprefix([ - ids for ids in computed_seq_block_ids # type: ignore - if ids - ]) - - def get_num_full_blocks_touched(self, blocks: List[Block]) -> int: - """Returns the number of full blocks that will be touched by - swapping in/out. - - Args: - blocks: List of blocks to be swapped. - Returns: - int: the number of full blocks that will be touched by - swapping in/out the given blocks. Non full blocks are ignored - when deciding the number of blocks to touch. - """ - num_touched_blocks: int = 0 - for block in blocks: - # If the block has a match in the cache and the cached - # block is not referenced, then we still count it as a - # touched block - if block.is_full and (not self.is_block_cached(block) or \ - (block.content_hash is not None and \ - self._cached_blocks[block.content_hash] in \ - self.evictor)): - num_touched_blocks += 1 - return num_touched_blocks - - def swap_out(self, blocks: List[Block]) -> None: - """Execute the swap out actions. Basically just free the - given blocks. - - Args: - blocks: List of blocks to be swapped out. - """ - for block in blocks: - self._free_block_id(block) - - def swap_in(self, blocks: List[Block]) -> None: - """Execute the swap in actions. Change the block id from - old allocator to current allocator for each block to finish - the block table update. - - Args: - blocks: List of blocks to be swapped in. - """ - for block in blocks: - # Here we allocate either immutable or mutable block and then - # extract its block_id. Note that the block object is released - # and the block_id is assigned to "block" to allow reusing the - # existing "block" object - if block.is_full: - tmp_block = self.allocate_immutable_block( - prev_block=block.prev_block, - token_ids=block.token_ids, - extra_hash=block.extra_hash) - else: - tmp_block = self.allocate_mutable_block( - prev_block=block.prev_block, extra_hash=block.extra_hash) - tmp_block.append_token_ids(block.token_ids) - - block_id = tmp_block.block_id - self._block_pool.free_block(tmp_block) - - block.block_id = block_id # Assign block_id - - def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: - """ - Given a list of block hashes, return the prefix of the block hashes that - are all cached. 
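find_cached_blocks_prefix() relies on the chained-hash invariant: if a block is cached, all of its predecessors are too, so the cached blocks always form a prefix and a binary search over "is this hash uncached?" locates the boundary in O(log N) lookups. A standalone sketch (hypothetical hash values; the key= argument needs Python 3.10+, which is why the removed code carries a <3.10 fallback):

from bisect import bisect_left

cached = {101, 102}                    # block hashes currently cached
block_hashes = [101, 102, 103, 104]    # hashes for one sequence, in order

idx = bisect_left(block_hashes, True, key=lambda h: h not in cached)
assert block_hashes[:idx] == [101, 102]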
- - Since a block's block hash includes the hashes of all previous blocks, - and we only allocate/deallocate blocks in the entire sequence, so if a - block is cached, then all previous blocks are also cached. With this - property, we can use binary search to find the prefix of cached blocks. - - Args: - block_hashes (List[int]): The list of block hashes. - - Returns: - List[int]: The prefix of the `block_hashes` that are cached. - """ - - def _block_is_cached(block_hash: PrefixHash) -> bool: - if block_hash not in self._cached_blocks: - return False - - cached_block_id = self._cached_blocks[block_hash] - # We only consider the blocks that are marked as computed. - return self.block_is_computed(cached_block_id) - - def _bisect_left(a, x, key: Callable[[PrefixHash], bool]) -> int: - - # python <= 3.10 don't have the key argument - if sys.version_info < (3, 10): - a = [key(e) for e in a] - return bisect_left(a, x) - else: - return bisect_left(a, x, key=key) - - # Look for the first block that's not cached, and returns the prefix - # i.e. blocks that are cached. - idx = _bisect_left(block_hashes, - True, - key=lambda x: not _block_is_cached(x)) - return block_hashes[:idx] - - -class PrefixCachingBlock(Block): - """A block implementation that supports prefix caching. - - The PrefixCachingBlock class represents a block of token IDs with prefix - caching capabilities. It wraps a NaiveBlock internally and provides - additional functionality for content hashing and promoting immutable blocks - with the prefix caching allocator. - - Args: - prev_block (Optional[PrefixCachingBlock]): The previous block in the - sequence. - token_ids (List[int]): The initial token IDs to be stored in the block. - block_size (int): The maximum number of token IDs that can be stored in - the block. - allocator (BlockAllocator): The prefix - caching block allocator associated with this block. - block_id (Optional[int], optional): The physical block index - of this block. Defaults to None. - extra_hash (Optional[int]): The hash value of additional factors - such as adapters that influence the block, apart from the token_ids. - """ - - # Note that we use 'None' as a string here instead of None because - # as of Python 3.12, hash(None) returns a constant predictable value. - # This could possibly make it easier to find and exploit hash - # collisions. 'None' as a string will be hashed differently per process, - # but consistently within the same process. This is the same as the - # behavior of None prior to Python 3.12. - _none_hash: int = hash('None') - - def __init__( - self, - prev_block: Optional[Block], - token_ids: List[int], - block_size: int, - allocator: BlockAllocator, - block_id: Optional[int] = None, - computed: bool = False, - extra_hash: Optional[int] = None, - ): - assert isinstance(allocator, PrefixCachingBlockAllocator), ( - "Currently this class is only tested with " - "PrefixCachingBlockAllocator. 
Got instead allocator = {}".format( - allocator)) - assert_prefix_caching_block_or_none(prev_block) - - self._prev_block = prev_block - self._cached_content_hash: Optional[int] = None - self._cached_num_tokens_total: int = 0 - self._allocator = allocator - self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME - self._computed = computed - self._extra_hash = extra_hash - - # On the first time, we create the block object, and next we only - # reinitialize it - if hasattr(self, "_block"): - self._block.__init__( # type: ignore[has-type] - prev_block=prev_block, - token_ids=token_ids, - block_size=block_size, - block_id=block_id, - allocator=self._allocator) - else: - self._block = NaiveBlock(prev_block=prev_block, - token_ids=token_ids, - block_size=block_size, - block_id=block_id, - allocator=self._allocator) - - self._update_num_tokens_total() - - def _update_num_tokens_total(self): - """Incrementally computes the number of tokens that there is - till the current block (included) - """ - res = 0 - - # Add all previous blocks - if self._prev_block is not None: - res += self._prev_block.num_tokens_total - - # Add current block - res += len(self.token_ids) - - self._cached_num_tokens_total = res - - @property - def computed(self) -> bool: - return self._computed - - @computed.setter - def computed(self, value) -> None: - self._computed = value - - @property - def last_accessed(self) -> float: - return self._last_accessed - - @last_accessed.setter - def last_accessed(self, last_accessed_ts: float): - self._last_accessed = last_accessed_ts - - def append_token_ids(self, token_ids: List[int]) -> None: - """Appends the given token IDs to the block and registers the block as - immutable if the block becomes full. - - Args: - token_ids (List[int]): The token IDs to be appended to the block. - """ - # Ensure this is mutable block (not promoted) - assert self.content_hash is None - assert not self.computed - - if len(token_ids) == 0: - return - - # Ensure there are input tokens - assert token_ids, "Got token_ids = {}".format(token_ids) - - # Naive block handles CoW. - self._block.append_token_ids(token_ids) - self._update_num_tokens_total() - - # If the content hash is present, then the block can be made immutable. - # Register ourselves with the allocator, potentially replacing the - # physical block index. - if self.content_hash is not None: - self.block_id = self._allocator.promote_to_immutable_block(self) - - @property - def block_id(self) -> Optional[int]: - return self._block.block_id - - @block_id.setter - def block_id(self, value) -> None: - self._block.block_id = value - - @property - def is_full(self) -> bool: - return self._block.is_full - - @property - def num_empty_slots(self) -> int: - return self._block.num_empty_slots - - @property - def num_tokens_total(self) -> int: - return self._cached_num_tokens_total - - @property - def block_size(self) -> int: - return self._block.block_size - - @property - def token_ids(self) -> List[int]: - return self._block.token_ids - - @property - def prev_block(self) -> Optional[Block]: - return self._prev_block - - @property - def extra_hash(self) -> Optional[int]: - return self._extra_hash - - @property - def content_hash(self) -> Optional[int]: - """Return the content-based hash of the current block, or None if it is - not yet defined. - - For the content-based hash to be defined, the current block must be - full. - """ - # If the hash is already computed, return it. 
- if self._cached_content_hash is not None: - return self._cached_content_hash - - # We cannot compute a hash for the current block because it is not full. - if not self.is_full: - return None - - is_first_block = self._prev_block is None - prev_block_hash = ( - self._none_hash if is_first_block else - self._prev_block.content_hash # type: ignore - ) - - # Previous block exists but does not yet have a hash. - # Return no hash in this case. - if prev_block_hash == self._none_hash and not is_first_block: - return None - - self._cached_content_hash = PrefixCachingBlock.hash_block_tokens( - is_first_block, - prev_block_hash, - cur_block_token_ids=self.token_ids, - extra_hash=self._extra_hash) - return self._cached_content_hash - - @classmethod - def hash_block_tokens(cls, - is_first_block: bool, - prev_block_hash: Optional[int], - cur_block_token_ids: List[int], - extra_hash: Optional[int] = None) -> int: - """Computes a hash value corresponding to the contents of a block and - the contents of the preceding block(s). The hash value is used for - prefix caching. - - Parameters: - - is_first_block (bool): A flag indicating if the block is the first in - the sequence. - - prev_block_hash (Optional[int]): The hash of the previous block. None - if this is the first block. - - cur_block_token_ids (List[int]): A list of token ids in the current - block. The current block is assumed to be full. - - extra_hash (Optional[int]): The hash value of additional factors - such as adapters that influence the block, apart from the token_ids. - - Returns: - - int: The computed hash value for the block. - """ - if is_first_block and prev_block_hash is None: - prev_block_hash = cls._none_hash - return hash((is_first_block, prev_block_hash, *cur_block_token_ids, - extra_hash)) - - -class ComputedBlocksTracker: - """ - Tracks the computed blocks for each sequence. - - Internally, it maintains a map from sequence id to the list of block hashes - for the sequence. We cache the hashes of the full blocks for each sequence, - and make sure the hash is calculated in the same way as the allocator. - When a sequence is being decoded, we also update the sequence's hash - accordingly and incrementally. - - From the sequence hash, with prefix caching enabled, we could also calculate - the number of cached tokens for the sequence by looking up the number of - cached block hashes in the allocator. - """ - - # Note that we use 'None' as a string here instead of None because - # as of Python 3.12, hash(None) returns a constant predictable value. - # This could possibly make it easier to find and exploit hash - # collisions. 'None' as a string will be hashed differently per process, - # but consistently within the same process. This is the same as the - # behavior of None prior to Python 3.12. - _none_hash: int = hash('None') - - def __init__( - self, - allocator: DeviceAwareBlockAllocator, - block_size: int, - enable_caching: bool, - ): - self._allocator = allocator - self._block_size = block_size - self._enable_caching = enable_caching - - # A map from seq_id to the list of block hashes for the - # sequence. This is so that we don't have to recompute the block hashes - # for the sequence when we need to check if the sequence is cached. - # Note a block that's not full will not have its hash calculated and - # recorded. - self._seq_id_to_blocks_hashes: Dict[int, List[int]] = {} - - # A map from seq_id to the number of tokens that are cached for the - # sequence. 
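A minimal, self-contained sketch of the chained hashing that hash_block_tokens implements above; the helper name hash_full_blocks and the module-level _NONE_HASH constant are illustrative, not part of vLLM:

from typing import List, Optional

_NONE_HASH = hash('None')  # per-process sentinel, mirroring _none_hash above

def hash_full_blocks(token_ids: List[int],
                     block_size: int,
                     extra_hash: Optional[int] = None) -> List[int]:
    """Return one hash per full block, chained through the previous hash."""
    hashes: List[int] = []
    prev_hash = _NONE_HASH
    for i in range(len(token_ids) // block_size):
        block = token_ids[i * block_size:(i + 1) * block_size]
        is_first_block = (i == 0)
        prev_hash = hash((is_first_block, prev_hash, *block, extra_hash))
        hashes.append(prev_hash)
    return hashes

# With block_size=4, a 10-token sequence has only two full (hashable) blocks.
assert len(hash_full_blocks(list(range(10)), block_size=4)) == 2

Because each hash folds in its predecessor, two sequences can share a cached block only if they share the entire token prefix up to and including that block.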
- # We need this so that a sequence in continuous prefill doesn't - # accidentally see its cached token count change. See comments in - # `get_num_cached_tokens` for more details. - self._seq_id_to_num_tokens_computed: Dict[int, int] = {} - - def _update_seq_hashes(self, seq: Sequence) -> None: - """Incrementally update the sequence's block hashes and record them.""" - assert self._enable_caching - - block_hashes_recorded = self._seq_id_to_blocks_hashes.get( - seq.seq_id, []) - cur_num_blocks_recorded = len(block_hashes_recorded) - token_ids = seq.get_token_ids() - assert len(token_ids) >= cur_num_blocks_recorded * self._block_size, ( - f"The sequence has {len(token_ids)} tokens, but" - f" already recorded {cur_num_blocks_recorded} blocks. " - "This should not happen since we assume blocks are " - "only appended other than recomputation. When the sequence is " - "recomputed, we should have removed the info of the old blocks.") - # Update the computed block hashes for the sequence. Since only full - # blocks are considered as "computed", we take floor here. - num_computed_blocks = len(token_ids) // self._block_size - - # We need to know the hash of the previous block to compute the hash of - # the current block so that blocks could be uniquely identified across - # sequences of prefixes. - prev_block_hash = (self._none_hash if cur_num_blocks_recorded == 0 else - block_hashes_recorded[-1]) - # Only update the computed block hashes for the new blocks - for i in range(cur_num_blocks_recorded, num_computed_blocks): - assert len(token_ids) >= (i + 1) * self._block_size - block_token_ids = token_ids[i * self._block_size:(i + 1) * - self._block_size] - - # NOTE: If there are any factors affecting the block besides - # token_ids, they should be added as input to extra_hash. - extra_hash = seq.extra_hash() - - # This has to be kept in sync with the allocator's hash - # calculation. - block_hash = PrefixCachingBlock.hash_block_tokens( - is_first_block=prev_block_hash == self._none_hash, - prev_block_hash=prev_block_hash, - cur_block_token_ids=block_token_ids, - extra_hash=extra_hash, - ) - block_hashes_recorded.append(block_hash) - prev_block_hash = block_hash - - self._seq_id_to_blocks_hashes[seq.seq_id] = block_hashes_recorded - - def get_num_cached_tokens(self, seq: Sequence) -> int: - if not self._enable_caching: - return 0 - - # We always try to update the sequence hashes on the fly. - # This is to ensure that we don't miss any cached tokens for the - # sequence during decode. - # This routine should only update hash for any new blocks too. - self._update_seq_hashes(seq) - - num_computed_tokens_prev = self._seq_id_to_num_tokens_computed.get( - seq.seq_id, None) - - # TODO(rickyx): This hack could be removed once we mark blocks as - # computed correctly with chunked prefills. - if num_computed_tokens_prev is not None and seq.is_prefill(): - # For a sequence that is still in prefill, we don't - # recompute the number of cached tokens. - # This also handles correctly chunked prefill since currently - # we mark blocks as computed even if the sequence is still partially - # prefilled. So a continuously prefilled sequence should not - # see its cached token count change while running. - return num_computed_tokens_prev - - block_hashes = self._seq_id_to_blocks_hashes[seq.seq_id] - - # This is O(logN), where N is the number of blocks. 
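The O(log N) lookup referenced here relies on the prefix-closed property described in find_cached_blocks_prefix above: a cached block implies all earlier blocks are cached, so "not cached" is monotone over the hash list and bisect_left can locate the first uncached block. A standalone sketch (cached_prefix and is_cached are illustrative names; the key= argument requires Python 3.10+, hence the fallback shim above):

from bisect import bisect_left
from typing import Callable, List

def cached_prefix(block_hashes: List[int],
                  is_cached: Callable[[int], bool]) -> List[int]:
    # First index whose block is NOT cached; everything before it is cached.
    idx = bisect_left(block_hashes, True, key=lambda h: not is_cached(h))
    return block_hashes[:idx]

cached = {11, 22, 33}
assert cached_prefix([11, 22, 33, 44, 55], cached.__contains__) == [11, 22, 33]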
- num_cached_blocks = len( - self._allocator.find_cached_blocks_prefix(block_hashes)) - num_cached_tokens = num_cached_blocks * self._block_size - self._seq_id_to_num_tokens_computed[seq.seq_id] = num_cached_tokens - return num_cached_tokens - - def remove_seq(self, seq_id: int) -> None: - """Stop tracking the sequence.""" - if not self._enable_caching: - return - assert seq_id in self._seq_id_to_blocks_hashes - del self._seq_id_to_blocks_hashes[seq_id] - - assert seq_id in self._seq_id_to_num_tokens_computed - del self._seq_id_to_num_tokens_computed[seq_id] - - -class LastAccessBlocksTracker: - """Manages the last access time of the tracked sequences, in order to allow - an efficient update of allocator's block last access times - """ - - def __init__(self, allocator): - self._allocator = allocator - self._seq_last_access: Dict[int, Optional[float]] = {} - - def add_seq(self, seq_id: int) -> None: - """Start tracking seq_id - """ - assert seq_id not in self._seq_last_access - self._seq_last_access[seq_id] = None - - def remove_seq(self, seq_id: int) -> None: - """Stop tracking seq_id - """ - assert seq_id in self._seq_last_access - del self._seq_last_access[seq_id] - - def update_last_access(self, seq_id: int, time: float) -> None: - assert seq_id in self._seq_last_access - self._seq_last_access[seq_id] = time - - def update_seq_blocks_last_access(self, seq_id: int, - block_ids: List[int]) -> None: - assert seq_id in self._seq_last_access - - ts = self._seq_last_access[seq_id] - - if ts is None: - # No last access was recorded, no need to update. - return - - self._allocator.mark_blocks_as_accessed(block_ids, ts) - - -def assert_prefix_caching_block_or_none(block: Optional[Block]): - if block is None: - return - assert isinstance(block, - PrefixCachingBlock), "Got block = {}".format(block) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py deleted file mode 100644 index e933c6ee7c8b..000000000000 --- a/vllm/core/block/utils.py +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Block manager utils.""" -from vllm.sequence import SequenceGroup -from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, - STR_NOT_IMPL_ENC_DEC_SWA) - - -def check_no_caching_or_swa_for_blockmgr_encdec( - block_mgr, seq_group: SequenceGroup) -> None: - ''' - Enforce that prefix caching & sliding-window attention (SWA) - are currently unsupported *specifically* for encoder/decoder models. - - Raises NotImplementedError if unsupported scenario is detected. 
- - Arguments: - - * block_mgr: BlockSpaceManager instance - * seq_group: SequenceGroup passed to block_mgr - ''' - - if seq_group.is_encoder_decoder(): - if block_mgr.max_block_sliding_window is not None: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) - - if block_mgr.enable_caching: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py deleted file mode 100644 index 4ec5a775f465..000000000000 --- a/vllm/core/block_manager.py +++ /dev/null @@ -1,525 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A block manager that manages token blocks.""" -from typing import Dict, List, Optional -from typing import Sequence as GenericSequence -from typing import Tuple - -from vllm.core.block.block_table import BlockTable -from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator -from vllm.core.block.interfaces import Block -from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker, - LastAccessBlocksTracker) -from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec -from vllm.core.interfaces import AllocStatus, BlockSpaceManager -from vllm.sequence import Sequence, SequenceGroup, SequenceStatus -from vllm.utils import Device - -SeqId = int -EncoderSeqId = str - - -class SelfAttnBlockSpaceManager(BlockSpaceManager): - """BlockSpaceManager which manages the allocation of KV cache. - - It owns responsibility for allocation, swapping, allocating memory for - autoregressively-generated tokens, and other advanced features such as - prefix caching, forking/copy-on-write, and sliding-window memory allocation. - - This class implements the design described in - https://github.com/vllm-project/vllm/pull/3492. - - Lookahead slots - The block manager has the notion of a "lookahead slot". These are slots - in the KV cache that are allocated for a sequence. Unlike the other - allocated slots, the content of these slots is undefined -- the worker - may use the memory allocations in any way. - - In practice, a worker could use these lookahead slots to run multiple - forward passes for a single scheduler invocation. Each successive - forward pass would write KV activations to the corresponding lookahead - slot. This allows low inter-token latency use-cases, where the overhead - of continuous batching scheduling is amortized over >1 generated tokens. - - Speculative decoding uses lookahead slots to store KV activations of - proposal tokens. - - See https://github.com/vllm-project/vllm/pull/3250 for more information - on lookahead scheduling. - - Args: - block_size (int): The size of each memory block. - num_gpu_blocks (int): The number of memory blocks allocated on GPU. - num_cpu_blocks (int): The number of memory blocks allocated on CPU. - watermark (float, optional): The threshold used for memory swapping. - Defaults to 0.01. - sliding_window (Optional[int], optional): The size of the sliding - window. Defaults to None. - enable_caching (bool, optional): Flag indicating whether caching is - enabled. Defaults to False. 
- """ - - def __init__( - self, - block_size: int, - num_gpu_blocks: int, - num_cpu_blocks: int, - watermark: float = 0.01, - sliding_window: Optional[int] = None, - enable_caching: bool = False, - ) -> None: - self.block_size = block_size - self.num_total_gpu_blocks = num_gpu_blocks - self.num_total_cpu_blocks = num_cpu_blocks - - self.sliding_window = sliding_window - # max_block_sliding_window is the max number of blocks that need to be - # allocated - self.max_block_sliding_window = None - if sliding_window is not None: - # +1 here because // rounds down - num_blocks = sliding_window // block_size + 1 - # +1 here because the last block may not be full, - # and so the sequence stretches one more block at the beginning - # For example, if sliding_window is 3 and block_size is 4, - # we may need 2 blocks when the second block only holds 1 token. - self.max_block_sliding_window = num_blocks + 1 - - self.watermark = watermark - assert watermark >= 0.0 - - self.enable_caching = enable_caching - - self.watermark_blocks = int(watermark * num_gpu_blocks) - - self.block_allocator = CpuGpuBlockAllocator.create( - allocator_type="prefix_caching" if enable_caching else "naive", - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks, - block_size=block_size, - ) - - self.block_tables: Dict[SeqId, BlockTable] = {} - self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {} - - self._computed_blocks_tracker = ComputedBlocksTracker( - self.block_allocator, self.block_size, self.enable_caching) - self._last_access_blocks_tracker = LastAccessBlocksTracker( - self.block_allocator) - - def can_allocate(self, - seq_group: SequenceGroup, - num_lookahead_slots: int = 0) -> AllocStatus: - # FIXME(woosuk): Here we assume that all sequences in the group share - # the same prompt. This may not be true for preempted sequences. - - check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) - - seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] - num_required_blocks = BlockTable.get_num_required_blocks( - seq.get_token_ids(), - block_size=self.block_size, - num_lookahead_slots=num_lookahead_slots, - ) - - if seq_group.is_encoder_decoder(): - encoder_seq = seq_group.get_encoder_seq() - assert encoder_seq is not None - num_required_blocks += BlockTable.get_num_required_blocks( - encoder_seq.get_token_ids(), - block_size=self.block_size, - ) - - if self.max_block_sliding_window is not None: - num_required_blocks = min(num_required_blocks, - self.max_block_sliding_window) - - num_free_gpu_blocks = self.block_allocator.get_num_free_blocks( - device=Device.GPU) - - # Use watermark to avoid frequent cache eviction. - if (self.num_total_gpu_blocks - num_required_blocks - < self.watermark_blocks): - return AllocStatus.NEVER - if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks: - return AllocStatus.OK - else: - return AllocStatus.LATER - - def _allocate_sequence(self, seq: Sequence) -> BlockTable: - block_table = BlockTable( - block_size=self.block_size, - block_allocator=self.block_allocator, - max_block_sliding_window=self.max_block_sliding_window, - ) - if seq.get_token_ids(): - # NOTE: If there are any factors affecting the block besides - # token_ids, they should be added as input to extra_hash. - extra_hash = seq.extra_hash() - - # Add blocks to the block table only if the sequence is non empty. 
- block_table.allocate(token_ids=seq.get_token_ids(), - extra_hash=extra_hash) - - return block_table - - def allocate(self, seq_group: SequenceGroup) -> None: - - # Allocate self-attention block tables for decoder sequences - waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) - assert not (set(seq.seq_id for seq in waiting_seqs) - & self.block_tables.keys()), "block table already exists" - - # NOTE: Here we assume that all sequences in the group have the same - # prompt. - seq = waiting_seqs[0] - block_table: BlockTable = self._allocate_sequence(seq) - self.block_tables[seq.seq_id] = block_table - - # Track seq - self._last_access_blocks_tracker.add_seq(seq.seq_id) - - # Assign the block table for each sequence. - for seq in waiting_seqs[1:]: - self.block_tables[seq.seq_id] = block_table.fork() - - # Track seq - self._last_access_blocks_tracker.add_seq(seq.seq_id) - - # Allocate cross-attention block table for encoder sequence - # - # NOTE: Here we assume that all sequences in the group have the same - # encoder prompt. - request_id = seq_group.request_id - - assert (request_id - not in self.cross_block_tables), \ - "block table already exists" - - check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) - - if seq_group.is_encoder_decoder(): - encoder_seq = seq_group.get_encoder_seq() - assert encoder_seq is not None - block_table = self._allocate_sequence(encoder_seq) - self.cross_block_tables[request_id] = block_table - - def can_append_slots(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - """Determine if there is enough space in the GPU KV cache to continue - generation of the specified sequence group. - - We use a worst-case heuristic: assume each touched block will require a - new allocation (either via CoW or new block). We can append slots if the - number of touched blocks is less than the number of free blocks. - - "Lookahead slots" are slots that are allocated in addition to the slots - for known tokens. The contents of the lookahead slots are not defined. - This is used by speculative decoding when speculating future tokens. - """ - - num_touched_blocks = 0 - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - block_table = self.block_tables[seq.seq_id] - - num_touched_blocks += ( - block_table.get_num_blocks_touched_by_append_slots( - token_ids=block_table.get_unseen_token_ids( - seq.get_token_ids()), - num_lookahead_slots=num_lookahead_slots, - )) - - num_free_gpu_blocks = self.block_allocator.get_num_free_blocks( - Device.GPU) - return num_touched_blocks <= num_free_gpu_blocks - - def append_slots( - self, - seq: Sequence, - num_lookahead_slots: int, - ) -> List[Tuple[int, int]]: - - block_table = self.block_tables[seq.seq_id] - - block_table.append_token_ids( - token_ids=block_table.get_unseen_token_ids(seq.get_token_ids()), - num_lookahead_slots=num_lookahead_slots, - num_computed_slots=seq.data.get_num_computed_tokens(), - extra_hash=seq.extra_hash(), - ) - # Return any new copy-on-writes. - new_cows = self.block_allocator.clear_copy_on_writes() - return new_cows - - def free(self, seq: Sequence) -> None: - seq_id = seq.seq_id - - if seq_id not in self.block_tables: - # Already freed or haven't been scheduled yet. 
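A hedged sketch of the worst-case check described in can_append_slots above: every block touched by an append is assumed to need a fresh allocation, so appending is allowed only while the touched-block total fits in the free pool. The touched-block estimate below is an illustrative over-approximation, not BlockTable.get_num_blocks_touched_by_append_slots:

import math
from typing import List

def blocks_touched_by_append(empty_slots_in_last_block: int,
                             num_unseen_tokens: int,
                             num_lookahead_slots: int,
                             block_size: int) -> int:
    slots_needed = num_unseen_tokens + num_lookahead_slots
    # Worst case: the tail block is touched (e.g. copied on write), plus one
    # new block per block_size slots that overflow its remaining capacity.
    overflow = max(0, slots_needed - empty_slots_in_last_block)
    return 1 + math.ceil(overflow / block_size)

def can_append(touched_per_seq: List[int], num_free_gpu_blocks: int) -> bool:
    return sum(touched_per_seq) <= num_free_gpu_blocks

# Two running sequences, 16-token blocks, one lookahead slot each.
touched = [blocks_touched_by_append(3, 1, 1, 16),
           blocks_touched_by_append(0, 1, 1, 16)]
assert touched == [1, 2] and can_append(touched, num_free_gpu_blocks=4)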
- return - - # Update seq block ids with the latest access time - self._last_access_blocks_tracker.update_seq_blocks_last_access( - seq_id, self.block_tables[seq.seq_id].physical_block_ids) - - # Untrack seq - self._last_access_blocks_tracker.remove_seq(seq_id) - self._computed_blocks_tracker.remove_seq(seq_id) - - # Free table/blocks - self.block_tables[seq_id].free() - del self.block_tables[seq_id] - - def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: - seq_id = seq.seq_id - self._computed_blocks_tracker.remove_seq(seq_id) - - def free_cross(self, seq_group: SequenceGroup) -> None: - request_id = seq_group.request_id - if request_id not in self.cross_block_tables: - # Already freed or hasn't been scheduled yet. - return - self.cross_block_tables[request_id].free() - del self.cross_block_tables[request_id] - - def get_block_table(self, seq: Sequence) -> List[int]: - block_ids = self.block_tables[seq.seq_id].physical_block_ids - return block_ids # type: ignore - - def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]: - request_id = seq_group.request_id - assert request_id in self.cross_block_tables - block_ids = self.cross_block_tables[request_id].physical_block_ids - assert all(b is not None for b in block_ids) - return block_ids # type: ignore - - def access_all_blocks_in_seq(self, seq: Sequence, now: float): - if self.enable_caching: - # Record the latest access time for the sequence. The actual update - # of the block ids is deferred to the sequence free(..) call, since - # only during freeing of block ids, the blocks are actually added to - # the evictor (which is when the most updated time is required) - # (This avoids expensive calls to mark_blocks_as_accessed(..)) - self._last_access_blocks_tracker.update_last_access( - seq.seq_id, now) - - def mark_blocks_as_computed(self, seq_group: SequenceGroup, - token_chunk_size: int): - # If prefix caching is enabled, mark immutable blocks as computed - # right after they have been scheduled (for prefill). This assumes - # the scheduler is synchronous so blocks are actually computed when - # scheduling the next batch. - self.block_allocator.mark_blocks_as_computed([]) - - def get_common_computed_block_ids( - self, seqs: List[Sequence]) -> GenericSequence[int]: - """Determine which blocks for which we skip prefill. - - With prefix caching we can skip prefill for previously-generated blocks. - Currently, the attention implementation only supports skipping cached - blocks if they are a contiguous prefix of cached blocks. - - This method determines which blocks can be safely skipped for all - sequences in the sequence group. - """ - computed_seq_block_ids = [] - for seq in seqs: - all_blocks = self.block_tables[seq.seq_id].physical_block_ids - num_cached_tokens = ( - self._computed_blocks_tracker.get_num_cached_tokens(seq)) - assert num_cached_tokens % self.block_size == 0 - num_cached_blocks = num_cached_tokens // self.block_size - computed_block_ids = all_blocks[:num_cached_blocks] - computed_seq_block_ids.append(computed_block_ids) - - # NOTE(sang): This assumes seq_block_ids doesn't contain any None. - return self.block_allocator.get_common_computed_block_ids( - computed_seq_block_ids) # type: ignore - - def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: - if parent_seq.seq_id not in self.block_tables: - # Parent sequence has either been freed or never existed. 
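The "contiguous prefix" rule above can be sketched as follows: each sequence keeps only its first num_cached_tokens // block_size block ids, and the result is the common prefix across sequences, so every sequence skips the same computed blocks. Treat the common-prefix step as an assumption about the allocator helper; the function name is illustrative:

from os.path import commonprefix
from typing import List

def common_computed_block_ids(per_seq_block_ids: List[List[int]],
                              per_seq_cached_tokens: List[int],
                              block_size: int) -> List[int]:
    computed = [
        ids[:cached // block_size]
        for ids, cached in zip(per_seq_block_ids, per_seq_cached_tokens)
    ]
    # commonprefix compares element-wise and works on lists, not just strings.
    return list(commonprefix(computed))

assert common_computed_block_ids([[1, 2, 3, 4], [1, 2, 9]],
                                 [48, 32], 16) == [1, 2]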
- return - src_block_table = self.block_tables[parent_seq.seq_id] - self.block_tables[child_seq.seq_id] = src_block_table.fork() - - # Track child seq - self._last_access_blocks_tracker.add_seq(child_seq.seq_id) - - def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> AllocStatus: - """Returns the AllocStatus for the given sequence_group - with num_lookahead_slots. - - Args: - sequence_group (SequenceGroup): The sequence group to swap in. - num_lookahead_slots (int): Number of lookahead slots used in - speculative decoding, default to 0. - - Returns: - AllocStatus: The AllocStatus for the given sequence group. - """ - return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, - num_lookahead_slots) - - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - """Returns the block id mapping (from CPU to GPU) generated by - swapping in the given seq_group with num_lookahead_slots. - - Args: - seq_group (SequenceGroup): The sequence group to swap in. - - Returns: - List[Tuple[int, int]]: The mapping of swapping block from CPU - to GPU. - """ - physical_block_id_mapping = [] - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - blocks = self.block_tables[seq.seq_id].blocks - if len(blocks) == 0: - continue - - seq_swap_mapping = self.block_allocator.swap(blocks=blocks, - src_device=Device.CPU, - dst_device=Device.GPU) - - # Refresh the block ids of the table (post-swap) - self.block_tables[seq.seq_id].update(blocks) - - seq_physical_block_id_mapping = { - self.block_allocator.get_physical_block_id( - Device.CPU, cpu_block_id): - self.block_allocator.get_physical_block_id( - Device.GPU, gpu_block_id) - for cpu_block_id, gpu_block_id in seq_swap_mapping.items() - } - - physical_block_id_mapping.extend( - list(seq_physical_block_id_mapping.items())) - - return physical_block_id_mapping - - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - """Returns whether we can swap out the given sequence_group - with num_lookahead_slots. - - Args: - seq_group (SequenceGroup): The sequence group to swap out. - num_lookahead_slots (int): Number of lookahead slots used in - speculative decoding, default to 0. - - Returns: - bool: Whether it's possible to swap out current sequence group. - """ - alloc_status = self._can_swap(seq_group, Device.CPU, - SequenceStatus.RUNNING) - return alloc_status == AllocStatus.OK - - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - """Returns the block id mapping (from GPU to CPU) generated by - swapping out the given sequence_group with num_lookahead_slots. - - Args: - sequence_group (SequenceGroup): The sequence group to swap out. - - Returns: - List[Tuple[int, int]]: The mapping of swapping block from - GPU to CPU. 
- """ - physical_block_id_mapping = [] - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - blocks = self.block_tables[seq.seq_id].blocks - if len(blocks) == 0: - continue - - seq_swap_mapping = self.block_allocator.swap(blocks=blocks, - src_device=Device.GPU, - dst_device=Device.CPU) - - # Refresh the block ids of the table (post-swap) - self.block_tables[seq.seq_id].update(blocks) - - seq_physical_block_id_mapping = { - self.block_allocator.get_physical_block_id( - Device.GPU, gpu_block_id): - self.block_allocator.get_physical_block_id( - Device.CPU, cpu_block_id) - for gpu_block_id, cpu_block_id in seq_swap_mapping.items() - } - - physical_block_id_mapping.extend( - list(seq_physical_block_id_mapping.items())) - - return physical_block_id_mapping - - def get_num_free_gpu_blocks(self) -> int: - return self.block_allocator.get_num_free_blocks(Device.GPU) - - def get_num_free_cpu_blocks(self) -> int: - return self.block_allocator.get_num_free_blocks(Device.CPU) - - def get_prefix_cache_hit_rate(self, device: Device) -> float: - return self.block_allocator.get_prefix_cache_hit_rate(device) - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - return self.block_allocator.reset_prefix_cache(device) - - def _can_swap(self, - seq_group: SequenceGroup, - device: Device, - status: SequenceStatus, - num_lookahead_slots: int = 0) -> AllocStatus: - """Returns the AllocStatus for swapping in/out the given sequence_group - on to the 'device'. - - Args: - sequence_group (SequenceGroup): The sequence group to swap in/out. - device (Device): device to swap the 'seq_group' on. - status (SequenceStatus): The status of sequence which is needed - for action. RUNNING for swap out and SWAPPED for swap in - num_lookahead_slots (int): Number of lookahead slots used in - speculative decoding, default to 0. - - Returns: - AllocStatus: The AllocStatus for swapping in/out the given - sequence_group on to the 'device'. - """ - # First determine the number of blocks that will be touched by this - # swap. Then verify if there are available blocks in the device - # to perform the swap. - num_blocks_touched = 0 - blocks: List[Block] = [] - for seq in seq_group.get_seqs(status=status): - block_table = self.block_tables[seq.seq_id] - if block_table.blocks is not None: - # Compute the number blocks to touch for the tokens to be - # appended. This does NOT include the full blocks that need - # to be touched for the swap. - num_blocks_touched += \ - block_table.get_num_blocks_touched_by_append_slots( - block_table.get_unseen_token_ids(seq.get_token_ids()), - num_lookahead_slots=num_lookahead_slots) - blocks.extend(block_table.blocks) - # Compute the number of full blocks to touch and add it to the - # existing count of blocks to touch. - num_blocks_touched += self.block_allocator.get_num_full_blocks_touched( - blocks, device=device) - - watermark_blocks = 0 - if device == Device.GPU: - watermark_blocks = self.watermark_blocks - - if self.block_allocator.get_num_total_blocks( - device) < num_blocks_touched: - return AllocStatus.NEVER - elif self.block_allocator.get_num_free_blocks( - device) - num_blocks_touched >= watermark_blocks: - return AllocStatus.OK - else: - return AllocStatus.LATER - - def get_num_cached_tokens(self, seq: Sequence) -> int: - """Get the number of tokens in blocks that are already computed and - cached in the block manager for the sequence. 
- """ - return self._computed_blocks_tracker.get_num_cached_tokens(seq) diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py deleted file mode 100644 index 7ec4768e90b1..000000000000 --- a/vllm/core/evictor.py +++ /dev/null @@ -1,157 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import enum -import heapq -from abc import ABC, abstractmethod -from typing import Dict, List, Tuple - - -class EvictionPolicy(enum.Enum): - """Enum for eviction policy used by make_evictor to instantiate the correct - Evictor subclass. - """ - LRU = enum.auto() - - -class Evictor(ABC): - """The Evictor subclasses should be used by the BlockAllocator class to - handle eviction of freed Blocks. - """ - - @abstractmethod - def __init__(self): - pass - - @abstractmethod - def __contains__(self, block_id: int) -> bool: - pass - - @abstractmethod - def evict(self) -> Tuple[int, int]: - """Runs the eviction algorithm and returns the evicted block's - content hash along with physical block id along with physical block id - """ - pass - - @abstractmethod - def add(self, block_id: int, content_hash: int, num_hashed_tokens: int, - last_accessed: float): - """Adds block to the evictor, making it a candidate for eviction""" - pass - - @abstractmethod - def update(self, block_id: int, last_accessed: float): - """Update corresponding block's access time in metadata""" - pass - - @abstractmethod - def remove(self, block_id: int): - """Remove a given block id from the cache.""" - pass - - @property - @abstractmethod - def num_blocks(self) -> int: - pass - - -class BlockMetaData: - """Data structure for storing key data describe cached block, so that - evitor could use to make its decision which one to choose for eviction - - Here we use physical block id as the dict key, as there maybe several - blocks with the same content hash, but their physical id is unique. - """ - - def __init__(self, content_hash: int, num_hashed_tokens: int, - last_accessed: float): - self.content_hash = content_hash - self.num_hashed_tokens = num_hashed_tokens - self.last_accessed = last_accessed - - -class LRUEvictor(Evictor): - """Evicts in a least-recently-used order using the last_accessed timestamp - that's recorded in the Block. If there are multiple blocks with - the same last_accessed time, then the one with the largest num_hashed_tokens - will be evicted. If two blocks each have the lowest last_accessed time and - highest num_hashed_tokens value, then one will be chose arbitrarily - """ - - # CLEANUP_THRESHOLD determines the maximum allowable size of the priority - # queue relative to the free table size. When this threshold is exceeded, - # a cleanup operation is triggered to reduce memory usage. - CLEANUP_THRESHOLD = 50 - - def __init__(self): - self.free_table: Dict[int, BlockMetaData] = {} - self.priority_queue = [] - - def __contains__(self, block_id: int) -> bool: - return block_id in self.free_table - - def evict(self) -> Tuple[int, int]: - if len(self.free_table) == 0: - raise ValueError("No usable cache memory left") - - while self.priority_queue: - # We do not remove outdated entries from the priority queue at the - # time of updating the last_accessed timestamp. Instead, outdated - # entries are filtered out here during eviction. Outdated entries - # would either not in the free table, or have older last accessed - # time. 
- last_accessed, _, block_id, content_hash = heapq.heappop( - self.priority_queue) - if (block_id in self.free_table and - self.free_table[block_id].last_accessed == last_accessed): - self.free_table.pop(block_id) - return block_id, content_hash - - raise ValueError("No usable cache memory left") - - def add(self, block_id: int, content_hash: int, num_hashed_tokens: int, - last_accessed: float): - self.free_table[block_id] = BlockMetaData(content_hash, - num_hashed_tokens, - last_accessed) - heapq.heappush( - self.priority_queue, - (last_accessed, -num_hashed_tokens, block_id, content_hash)) - self._cleanup_if_necessary() - - def update(self, block_id: int, last_accessed: float): - self.free_table[block_id].last_accessed = last_accessed - - def _cleanup_if_necessary(self): - if len(self.priority_queue) > LRUEvictor.CLEANUP_THRESHOLD * len( - self.free_table): - self._cleanup() - - def _cleanup(self): - new_priority_queue: List[Tuple[float, int, int, int]] = [] - - for block_id, block in self.free_table.items(): - new_priority_queue.append( - (block.last_accessed, -block.num_hashed_tokens, block_id, - block.content_hash)) - heapq.heapify(new_priority_queue) - - self.priority_queue = new_priority_queue - - def remove(self, block_id: int): - if block_id not in self.free_table: - raise ValueError( - "Attempting to remove block that's not in the evictor") - self.free_table.pop(block_id) - - @property - def num_blocks(self) -> int: - return len(self.free_table) - - -def make_evictor(eviction_policy: EvictionPolicy) -> Evictor: - if eviction_policy == EvictionPolicy.LRU: - return LRUEvictor() - else: - raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py deleted file mode 100644 index 69b9169ddd8a..000000000000 --- a/vllm/core/interfaces.py +++ /dev/null @@ -1,139 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import enum -from abc import ABC, abstractmethod -from typing import List, Optional -from typing import Sequence as GenericSequence -from typing import Tuple - -from vllm.sequence import Sequence, SequenceGroup -from vllm.utils import Device - - -class AllocStatus(enum.Enum): - """Result for BlockSpaceManager.can_allocate - - 1. Ok: seq_group can be allocated now. - 2. Later: seq_group cannot be allocated. - The capacity of allocator is larger than seq_group required. - 3. Never: seq_group can never be allocated. - The seq_group is too large to allocated in GPU. 
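A compact illustration of the lazy-deletion pattern the LRU evictor above relies on, in a simplified variant that pushes a fresh heap entry on every access: stale entries are left in the heap and skipped at pop time by checking them against the free table:

import heapq

free_table = {}        # block_id -> last_accessed
heap = []              # (last_accessed, block_id), possibly stale

def touch(block_id: int, ts: float) -> None:
    free_table[block_id] = ts
    heapq.heappush(heap, (ts, block_id))   # older entries stay behind, stale

def evict() -> int:
    while heap:
        ts, block_id = heapq.heappop(heap)
        if free_table.get(block_id) == ts:  # entry is still current
            del free_table[block_id]
            return block_id
    raise ValueError("No usable cache memory left")

touch(7, 1.0)
touch(8, 2.0)
touch(7, 3.0)          # block 7 becomes the most recently used
assert evict() == 8    # the stale (1.0, 7) entry is skipped when reached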
- """ - OK = enum.auto() - LATER = enum.auto() - NEVER = enum.auto() - - -class BlockSpaceManager(ABC): - - @staticmethod - def get_block_space_manager_class(version: str): - version = version.lower() - - if version == "selfattn": - from vllm.core.block_manager import SelfAttnBlockSpaceManager - return SelfAttnBlockSpaceManager - - if version == "placeholder": - from vllm.core.placeholder_block_space_manager import ( - PlaceholderBlockSpaceManager) - return PlaceholderBlockSpaceManager - - raise ValueError(f"Unknown version {version=}") - - @abstractmethod - def can_allocate(self, - seq_group: SequenceGroup, - num_lookahead_slots: int = 0) -> AllocStatus: - pass - - @abstractmethod - def allocate(self, seq_group: SequenceGroup) -> None: - pass - - @abstractmethod - def can_append_slots(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - pass - - @abstractmethod - def append_slots( - self, - seq: Sequence, - num_lookahead_slots: int, - ) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: - pass - - @abstractmethod - def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> AllocStatus: - pass - - @abstractmethod - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - pass - - @abstractmethod - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - pass - - @abstractmethod - def free(self, seq: Sequence) -> None: - pass - - @abstractmethod - def get_block_table(self, seq: Sequence) -> List[int]: - pass - - @abstractmethod - def get_num_free_gpu_blocks(self) -> int: - pass - - @abstractmethod - def get_num_free_cpu_blocks(self) -> int: - pass - - @abstractmethod - def access_all_blocks_in_seq( - self, - seq: Sequence, - access_time: float, - ) -> None: - pass - - @abstractmethod - def get_common_computed_block_ids( - self, seqs: List[Sequence]) -> GenericSequence[int]: - pass - - @abstractmethod - def mark_blocks_as_computed(self, seq_group: SequenceGroup, - token_chunk_size: int): - pass - - @abstractmethod - def get_prefix_cache_hit_rate(self, device: Device) -> float: - """Prefix cache hit rate. -1 means not supported or disabled.""" - pass - - @abstractmethod - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - """Reset prefix cache for specified or all devices.""" - pass - - @abstractmethod - def get_num_cached_tokens(self, seq: Sequence) -> int: - pass - - @abstractmethod - def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: - pass \ No newline at end of file diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py deleted file mode 100644 index 679515924e85..000000000000 --- a/vllm/core/placeholder_block_space_manager.py +++ /dev/null @@ -1,103 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Tuple - -from vllm.core.interfaces import AllocStatus, BlockSpaceManager -from vllm.sequence import Sequence, SequenceGroup -from vllm.utils import Device - - -class PlaceholderBlockSpaceManager(BlockSpaceManager): - """A version of BlockSpaceManager for use in environments - where block management is not required. - For example: pooling models or attention-free models like Mamba. 
- - This class provides the same interface as BlockSpaceManager, but its - methods perform no actions or return simple values like True in specific - actions. It's designed to be used in scenarios where the overhead of - block management is unnecessary, such as in an embedding environment. - """ - - def __init__( - self, - **kwargs, - ) -> None: - pass - - def can_allocate(self, - seq_group: SequenceGroup, - num_lookahead_slots: int = 0) -> AllocStatus: - # Always return OK for dummy purposes - return AllocStatus.OK - - def allocate(self, seq_group: SequenceGroup) -> None: - # No actual allocation logic needed - pass - - def can_append_slots(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - return True - - def append_slots( - self, - seq: Sequence, - num_lookahead_slots: int, - ) -> List[Tuple[int, int]]: - return [] - - def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: - pass - - def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> AllocStatus: - return AllocStatus.OK - - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - return None # type: ignore - - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - return True - - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: - return None # type: ignore - - def free(self, seq: Sequence) -> None: - # No operation on free - return - - def get_block_table(self, seq: Sequence) -> List[int]: - return None # type: ignore - - def get_num_free_gpu_blocks(self) -> int: - return 1 - - def get_num_free_cpu_blocks(self) -> int: - return 1 - - def access_all_blocks_in_seq( - self, - seq: Sequence, - access_time: float, - ) -> None: - pass - - def get_common_computed_block_ids(self, - seq_group: List[Sequence]) -> List[int]: - return [] - - def mark_blocks_as_computed(self, seq_group: SequenceGroup, - token_chunk_size: int): - pass - - def get_prefix_cache_hit_rate(self, device: Device) -> float: - return -1 - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - return True - - def get_num_cached_tokens(self, seq: Sequence) -> int: - return 0 - - def remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: - return diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py deleted file mode 100644 index 61346da145bb..000000000000 --- a/vllm/core/scheduler.py +++ /dev/null @@ -1,2114 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import enum -import os -import random -import time -from collections import deque -from dataclasses import dataclass, field -from typing import Callable, Deque, Dict, Iterable, List, Optional -from typing import Sequence as GenericSequence -from typing import Set, Tuple, Union - -from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig -from vllm.core.interfaces import AllocStatus, BlockSpaceManager -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.sequence import (Sequence, SequenceData, SequenceGroup, - SequenceGroupBase, SequenceGroupMetadata, - SequenceGroupMetadataDelta, SequenceStage, - SequenceStatus) -from vllm.utils import Device, PyObjectCache - -logger = init_logger(__name__) - -# Test-only. If configured, decode is preempted with -# ARTIFICIAL_PREEMPTION_PROB% probability. 
-ENABLE_ARTIFICIAL_PREEMPT = bool( - os.getenv("VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT", False)) # noqa -ARTIFICIAL_PREEMPTION_PROB = 0.5 -ARTIFICIAL_PREEMPTION_MAX_CNT = 500 - - -class PreemptionMode(enum.Enum): - """Preemption modes. - - 1. Swapping: Swap out the blocks of the preempted sequences to CPU memory - and swap them back in when the sequences are resumed. - 2. Recomputation: Discard the blocks of the preempted sequences and - recompute them when the sequences are resumed, treating the sequences as - new prompts. - """ - - SWAP = enum.auto() - RECOMPUTE = enum.auto() - - -@dataclass -class SchedulingBudget: - """The available slots for scheduling. - - TODO(sang): Right now, the budget is request_id-aware meaning it can ignore - budget update from the same request_id. It is because in normal scheduling - path, we update RUNNING num_seqs ahead of time, meaning it could be - updated more than once when scheduling RUNNING requests. Since this won't - happen if we only have chunked prefill scheduling, we can remove this - feature from the API when chunked prefill is enabled by default. - """ - - token_budget: int - max_num_seqs: int - _request_ids_num_batched_tokens: Set[str] = field(default_factory=set) - _request_ids_num_curr_seqs: Set[str] = field(default_factory=set) - # Number of cached tokens in the batch. - _num_cached_tokens: int = 0 - # Number of actual non-cached tokens in the batch. - _num_batched_tokens: int = 0 - _num_curr_seqs: int = 0 - - def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int): - # We allow num_new_tokens to be 0 when the entire sequence has - # been cached. - assert num_new_tokens >= 0 - assert num_new_seqs != 0 - return (self.num_batched_tokens + num_new_tokens <= self.token_budget - and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs) - - def remaining_token_budget(self): - return self.token_budget - self.num_batched_tokens - - def add_num_batched_tokens(self, - req_id: str, - num_batched_tokens: int, - num_cached_tokens: int = 0): - if req_id in self._request_ids_num_batched_tokens: - return - assert num_cached_tokens >= 0 - assert num_batched_tokens >= 0 - - self._request_ids_num_batched_tokens.add(req_id) - self._num_batched_tokens += num_batched_tokens - self._num_cached_tokens += num_cached_tokens - - def subtract_num_batched_tokens(self, req_id: str, - num_batched_tokens: int): - if req_id in self._request_ids_num_batched_tokens: - self._request_ids_num_batched_tokens.remove(req_id) - self._num_batched_tokens -= num_batched_tokens - - def add_num_seqs(self, req_id: str, num_curr_seqs: int): - if req_id in self._request_ids_num_curr_seqs: - return - - self._request_ids_num_curr_seqs.add(req_id) - self._num_curr_seqs += num_curr_seqs - - def subtract_num_seqs(self, req_id: str, num_curr_seqs: int): - if req_id in self._request_ids_num_curr_seqs: - self._request_ids_num_curr_seqs.remove(req_id) - self._num_curr_seqs -= num_curr_seqs - - @property - def num_batched_tokens(self): - return self._num_batched_tokens - - @property - def num_curr_seqs(self): - return self._num_curr_seqs - - @property - def num_cached_tokens(self): - return self._num_cached_tokens - - -@dataclass -class ScheduledSequenceGroup: - # A sequence group that's scheduled. - seq_group: SequenceGroup - # The total chunk size (number of tokens) to process for next iteration. - # 1 for decoding. Same as prompt tokens for prefill, but if prefill is - # chunked, it can be smaller than that. 
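A minimal sketch of the budget bookkeeping defined above, simplified to a single seen-set: a request fits only if it respects both the token budget and the sequence budget, and repeated accounting for the same request id is a no-op (MiniBudget is an illustrative name, not the vLLM class):

from dataclasses import dataclass, field
from typing import Set

@dataclass
class MiniBudget:
    token_budget: int
    max_num_seqs: int
    _seen: Set[str] = field(default_factory=set)
    _tokens: int = 0
    _seqs: int = 0

    def can_schedule(self, num_new_tokens: int, num_new_seqs: int) -> bool:
        return (self._tokens + num_new_tokens <= self.token_budget
                and self._seqs + num_new_seqs <= self.max_num_seqs)

    def add(self, req_id: str, num_tokens: int, num_seqs: int) -> None:
        if req_id in self._seen:      # same request is counted at most once
            return
        self._seen.add(req_id)
        self._tokens += num_tokens
        self._seqs += num_seqs

budget = MiniBudget(token_budget=2048, max_num_seqs=4)
budget.add("req-0", num_tokens=1024, num_seqs=1)
budget.add("req-0", num_tokens=1024, num_seqs=1)   # ignored: already counted
assert budget.can_schedule(num_new_tokens=1024, num_new_seqs=1)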
- token_chunk_size: int - - -@dataclass -class SchedulerOutputs: - """The scheduling decision made from a scheduler.""" - - # Scheduled sequence groups. - scheduled_seq_groups: GenericSequence[ScheduledSequenceGroup] - # Number of prefill groups scheduled. - num_prefill_groups: int - # Total number of batched tokens. - num_batched_tokens: int - # Blocks to swap in. List of CPU -> GPU block number. - blocks_to_swap_in: List[Tuple[int, int]] - # Blocks to swap out. List of GPU -> CPU block number. - blocks_to_swap_out: List[Tuple[int, int]] - # Blocks to copy. Source to dest block. - blocks_to_copy: List[Tuple[int, int]] - # Sequence groups that are going to be ignored. - ignored_seq_groups: List[SequenceGroup] - # The number of slots for lookahead decoding. - num_lookahead_slots: int - # The number of requests in the running queue - running_queue_size: int - preempted: int - - def __post_init__(self): - # Swap in and swap out should never happen at the same time. - assert not (self.blocks_to_swap_in and self.blocks_to_swap_out) - - self.num_loras: int = len(self.lora_requests) - if self.num_loras > 0: - self._sort_by_lora_ids() - - def is_empty(self) -> bool: - # NOTE: We do not consider the ignored sequence groups. - return (not self.scheduled_seq_groups and not self.blocks_to_swap_in - and not self.blocks_to_swap_out and not self.blocks_to_copy) - - def _sort_by_lora_ids(self): - assert 0 <= self.num_prefill_groups <= len(self.scheduled_seq_groups) - - def key_fn(group: ScheduledSequenceGroup): - key = (group.seq_group.lora_int_id, group.seq_group.request_id) - if 0 < self.num_prefill_groups < len(self.scheduled_seq_groups): - # Sort sequence groups so that all prefills come before all - # decodes as required by chunked prefill. - return (not group.seq_group.is_prefill(), *key) - return key - - self.scheduled_seq_groups = sorted(self.scheduled_seq_groups, - key=key_fn) - - @property - def lora_requests(self) -> Set[LoRARequest]: - return { - g.seq_group.lora_request - for g in self.scheduled_seq_groups - if g.seq_group.lora_request is not None - } - - -@dataclass -class SchedulerRunningOutputs: - """The requests that are scheduled from a running queue. - - Could contain prefill (prefill that's chunked) or decodes. If there's not - enough memory, it can be preempted (for recompute) or swapped out. - """ - - # Selected sequences that are running and in a decoding phase. - decode_seq_groups: List[ScheduledSequenceGroup] - # Selected sequences that are running and in a prefill phase. - # I.e., it means the prefill has been chunked. - prefill_seq_groups: List[ScheduledSequenceGroup] - # The preempted sequences. - preempted: List[SequenceGroup] - # Sequences that are swapped out. - swapped_out: List[SequenceGroup] - # The blocks to swap out. - blocks_to_swap_out: List[Tuple[int, int]] - # The blocks to copy. - blocks_to_copy: List[Tuple[int, int]] - # The number of slots for lookahead decoding. - num_lookahead_slots: int - - # Optimization for fast-access to seq_group lists - decode_seq_groups_list: List[SequenceGroup] - prefill_seq_groups_list: List[SequenceGroup] - - @classmethod - def create_empty(cls) -> "SchedulerRunningOutputs": - return SchedulerRunningOutputs( - decode_seq_groups=[], - prefill_seq_groups=[], - preempted=[], - swapped_out=[], - blocks_to_swap_out=[], - blocks_to_copy=[], - num_lookahead_slots=0, - decode_seq_groups_list=[], - prefill_seq_groups_list=[], - ) - - -@dataclass -class SchedulerSwappedInOutputs: - """The requests that are scheduled from a swap queue. 
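The sort key in _sort_by_lora_ids above can be illustrated in isolation: when prefills and decodes are mixed (chunked prefill), prefills are ordered first, and within each phase groups are clustered by LoRA id and then request id (Group and sort_groups are illustrative names):

from typing import List, NamedTuple

class Group(NamedTuple):
    request_id: str
    lora_int_id: int
    is_prefill: bool

def sort_groups(groups: List[Group]) -> List[Group]:
    return sorted(
        groups,
        key=lambda g: (not g.is_prefill, g.lora_int_id, g.request_id))

groups = [Group("c", 1, False), Group("b", 2, True), Group("a", 1, True)]
assert [g.request_id for g in sort_groups(groups)] == ["a", "b", "c"]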
- - Could contain prefill (prefill that's chunked) or decodes. - """ - - # Selected sequences that are going to be swapped in and is in a - # decoding phase. - decode_seq_groups: List[ScheduledSequenceGroup] - # Selected sequences that are going to be swapped in and in a prefill - # phase. I.e., it means the prefill has been chunked. - prefill_seq_groups: List[ScheduledSequenceGroup] - # The blocks to swap in. - blocks_to_swap_in: List[Tuple[int, int]] - # The blocks to copy. - blocks_to_copy: List[Tuple[int, int]] - # The number of slots for lookahead decoding. - num_lookahead_slots: int - # Infeasible sequence groups. - infeasible_seq_groups: List[SequenceGroup] - - @classmethod - def create_empty(cls) -> "SchedulerSwappedInOutputs": - return SchedulerSwappedInOutputs( - decode_seq_groups=[], - prefill_seq_groups=[], - blocks_to_swap_in=[], - blocks_to_copy=[], - num_lookahead_slots=0, - infeasible_seq_groups=[], - ) - - -@dataclass -class SchedulerPrefillOutputs: - """The requests that are scheduled from a waiting queue. - - Could contain a fresh prefill requests or preempted requests that need - to be recomputed from scratch. - """ - - # Selected sequences for prefill. - seq_groups: List[ScheduledSequenceGroup] - # Ignored sequence groups. - ignored_seq_groups: List[SequenceGroup] - num_lookahead_slots: int - - @classmethod - def create_empty(cls) -> "SchedulerPrefillOutputs": - return SchedulerPrefillOutputs( - seq_groups=[], - ignored_seq_groups=[], - num_lookahead_slots=0, - ) - - -def seq_group_metadata_builder(): - return SequenceGroupMetadata(request_id="", - is_prompt=False, - seq_data={}, - sampling_params=None, - block_tables={}) - - -def scheduler_running_outputs_builder(): - return SchedulerRunningOutputs(decode_seq_groups=[], - prefill_seq_groups=[], - preempted=[], - swapped_out=[], - blocks_to_swap_out=[], - blocks_to_copy=[], - num_lookahead_slots=0, - prefill_seq_groups_list=[], - decode_seq_groups_list=[]) - - -def scheduled_seq_group_builder(): - return ScheduledSequenceGroup(SequenceGroup.__new__(SequenceGroup), - token_chunk_size=0) - # return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0) - - -@dataclass -class PartialPrefillMetadata: - """Holds information about the partial prefills that are currently running - during a single iteration of the Scheduler. - When chunked prefill is enabled, we allow a certain number of seqs to be - partially prefilled during each iteration. Having multiple partial prefills - in flight allows us to minimize TTFT and avoid decode starvation in cases - where a single sequence group with a very large prompt blocks the queue for - too many iterations. - The number of long prefill requests is limited so that smaller - requests may jump the queue in front of them and get to the decode - phase faster. 
- """ - - # A minimum bound on the total number of prefills to be scheduled during - # this iteration - schedulable_prefills: int - - # The number of long prefill requests currently running - long_prefills: int - - scheduler_config: SchedulerConfig - - def can_schedule(self, seq_group: SequenceGroup) -> bool: - """When concurrent partial prefills are enabled, - we limit the number of long requests and only accept - shorter requests from the queue while running them - concurrently""" - return not (seq_group.first_seq.get_num_new_tokens() - > self.scheduler_config.long_prefill_token_threshold - and self.long_prefills - >= self.scheduler_config.max_long_partial_prefills - and self.scheduler_config.max_num_partial_prefills > 1) - - def maybe_increment_partial_prefills(self, - seq_group: SequenceGroup) -> None: - # When a new prefill is scheduled, we need to know if it is a - # long request - if (seq_group.first_seq.get_num_new_tokens() - > self.scheduler_config.long_prefill_token_threshold): - self.long_prefills += 1 - - @classmethod - def from_queues( - cls, - running: Deque[SequenceGroup], - waiting: Deque[SequenceGroup], - scheduler_config: SchedulerConfig, - ) -> "PartialPrefillMetadata": - """Create a PartialPrefillMetadata object from the current state of - the scheduler's queues. - This accounts for the currently running prefill requests, and peeks into - the waiting queue to see if there are more prefills to potentially be - scheduled during this iteration.""" - prefills = 0 - long_prefills = 0 - - waiting_long_prefills = 0 - - for sg in running: - if sg.first_seq.data.stage == SequenceStage.PREFILL: - prefills += 1 - if (sg.first_seq.get_num_new_tokens() - > scheduler_config.long_prefill_token_threshold): - long_prefills += 1 - - for sg in waiting: - # Don't bother looping through the rest of the queue if we know - # there are already at - # least max_partial_prefills requests to fill - if prefills >= scheduler_config.max_num_partial_prefills: - break - - # Don't count long requests from the waiting queue if we aren't - # going to schedule them anyway - if (sg.first_seq.get_num_new_tokens() - > scheduler_config.long_prefill_token_threshold): - if (long_prefills + waiting_long_prefills - >= scheduler_config.max_long_partial_prefills): - continue - waiting_long_prefills += 1 - prefills += 1 - - # NB: long_prefills and waiting_long_prefills are tracked separately. - # We don't account for the waiting requests here because we need to use - # this metadata to track how many have actually been scheduled. - return PartialPrefillMetadata( - schedulable_prefills=min( - prefills, scheduler_config.max_num_partial_prefills), - long_prefills=long_prefills, - scheduler_config=scheduler_config, - ) - - -class Scheduler: - - def __init__( - self, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig, - lora_config: Optional[LoRAConfig], - pipeline_parallel_size: int = 1, - output_proc_callback: Optional[Callable] = None, - ) -> None: - self.scheduler_config = scheduler_config - self.cache_config = cache_config - # Note for LoRA scheduling: the current policy is extremely - # simple and NOT fair. It can lead to starvation of some - # LoRAs. This should be improved in the future. 
- self.lora_config = lora_config - - version = "selfattn" - if (self.scheduler_config.runner_type == "pooling" - or self.cache_config.is_attention_free): - version = "placeholder" - - BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class( - version) - - num_gpu_blocks = cache_config.num_gpu_blocks - if num_gpu_blocks: - num_gpu_blocks //= pipeline_parallel_size - - num_cpu_blocks = cache_config.num_cpu_blocks - if num_cpu_blocks: - num_cpu_blocks //= pipeline_parallel_size - - # Create the block space manager. - self.block_manager = BlockSpaceManagerImpl( - block_size=self.cache_config.block_size, - num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks, - sliding_window=self.cache_config.sliding_window, - enable_caching=self.cache_config.enable_prefix_caching, - ) - - # Sequence groups in the WAITING state. - # Contain new prefill or preempted requests. - self.waiting: Deque[SequenceGroup] = deque() - # Sequence groups in the RUNNING state. - # Contain decode requests. - self.running: Deque[SequenceGroup] = deque() - # Sequence groups in the SWAPPED state. - # Contain decode requests that are swapped out. - self.swapped: Deque[SequenceGroup] = deque() - # Sequence groups finished requests ids since last step iteration. - # It lets the model know that any state associated with these requests - # can and must be released after the current step. - # This is used to evict the finished requests from the Mamba cache. - self._finished_requests_ids: List[str] = list() - # Time at previous scheduling step - self.prev_time = 0.0 - # Did we schedule a prompt at previous step? - self.prev_prompt = False - # Latency of the last prompt step - self.last_prompt_latency = 0.0 - # preemption mode, RECOMPUTE or SWAP - self.user_specified_preemption_mode = scheduler_config.preemption_mode - - # The following field is test-only. It is used to inject artificial - # preemption. - self.enable_artificial_preemption = ENABLE_ARTIFICIAL_PREEMPT - self.artificial_preempt_cnt = (ARTIFICIAL_PREEMPTION_MAX_CNT - if self.enable_artificial_preemption - else 0) - self.num_cumulative_preemption: int = 0 - - # Used to cache python objects - self._seq_group_metadata_cache: List[PyObjectCache] = [] - self._scheduler_running_outputs_cache: List[PyObjectCache] = [] - self._scheduled_seq_group_cache: List[PyObjectCache] = [] - - # For async output processing, we need to swap cache buffers between - # iterations. I.e. since the output processing is lagged one step, - # we cannot reuse the cached objects immediately when the schedule() - # is called again, but only when schedule() is called the second time. - self.output_proc_callback = output_proc_callback - self.use_async_output_proc = self.output_proc_callback is not None - self.num_cache_iters = 2 if self.use_async_output_proc else 1 - - self.cache_id = 0 - for i in range(self.num_cache_iters): - self._seq_group_metadata_cache.append( - PyObjectCache(seq_group_metadata_builder)) - self._scheduler_running_outputs_cache.append( - PyObjectCache(scheduler_running_outputs_builder)) - self._scheduled_seq_group_cache.append( - PyObjectCache(scheduled_seq_group_builder)) - - # For async postprocessor, the extra decode run cannot be done - # when the request reaches max_model_len. 
In this case, the request - # will be stopped during schedule() call and added to this stop list - # for processing and deallocation by the free_finished_seq_groups() - self._async_stopped: List[SequenceGroup] = [] - - # List with the chunk sizes to hand out to each sequence depending - # on how many partial prefills are running. This is slightly faster than - # running an integer division every time a prefill is scheduled. - # This splits the budget evenly among all prefills. - self.partial_prefill_budget_lookup_list = [0] * ( - self.scheduler_config.max_num_partial_prefills + 1) - self.partial_prefill_budget_lookup_list[0] = ( - scheduler_config.max_num_batched_tokens) - for i in range(1, self.scheduler_config.max_num_partial_prefills + 1): - self.partial_prefill_budget_lookup_list[i] = ( - scheduler_config.max_num_batched_tokens // i) - - @property - def next_cache_id(self): - return (self.cache_id + 1) % self.num_cache_iters - - @property - def lora_enabled(self) -> bool: - return bool(self.lora_config) - - @property - def num_decoding_tokens_per_seq(self) -> int: - """The number of new tokens.""" - return 1 - - def add_seq_group(self, seq_group: SequenceGroup) -> None: - # Add sequence groups to the waiting queue. - self.waiting.append(seq_group) - - def _add_seq_group_to_running(self, seq_group: SequenceGroup) -> None: - # Add sequence groups to the running queue. - # Only for testing purposes. - self.running.append(seq_group) - - def _add_seq_group_to_swapped(self, seq_group: SequenceGroup) -> None: - # Add sequence groups to the swapped queue. - # Only for testing purposes. - self.swapped.append(seq_group) - - def abort_seq_group( - self, - request_id: Union[str, Iterable[str]], - seq_id_to_seq_group: Optional[Dict[str, SequenceGroupBase]] = None, - ) -> None: - """Aborts a sequence group with the given ID. - - Check if the sequence group with the given ID - is present in any of the state queue. - If present, remove the sequence group from the state queue. - Also, if any of the sequences in the sequence group is not finished, - free the sequence with status `FINISHED_ABORTED`. - Otherwise, do nothing. - - Args: - request_id: The ID(s) of the sequence group to abort. - seq_id_to_seq_group: helper for groups with n>1 - """ - if isinstance(request_id, str): - request_id = (request_id, ) - request_ids = set(request_id) - seq_id_to_seq_group = seq_id_to_seq_group or {} - for state_queue in [self.waiting, self.running, self.swapped]: - aborted_groups: List[SequenceGroup] = [] - for seq_group in state_queue: - # When n>1, seq_group.request_id looks like - # foo_parallel_sample_0, while request_ids is just foo, and we - # should resolve it as real_request_id to match. - if seq_group.request_id in seq_id_to_seq_group: - real_request_id = seq_id_to_seq_group[ - seq_group.request_id].group_id - else: - real_request_id = seq_group.request_id - if real_request_id in request_ids: - # Appending aborted group into pending list. - aborted_groups.append(seq_group) - # We can't remove real_request_id in request_ids here, - # because there may be other seq groups sharing the same - # real_request_id - for aborted_group in aborted_groups: - # Remove the sequence group from the state queue. - state_queue.remove(aborted_group) - # Remove the aborted request from the Mamba cache. 
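The partial_prefill_budget_lookup_list built in the deleted constructor trades a little memory for skipping an integer division every time a prefill is scheduled, splitting the batch token budget evenly among the concurrent prefills. A minimal sketch of that precomputation; the helper name is illustrative.

def build_prefill_budget_lookup(max_num_batched_tokens: int,
                                max_num_partial_prefills: int) -> list[int]:
    # Precompute the per-prefill token budget for every possible count of
    # concurrent partial prefills so the hot path can index instead of divide.
    lookup = [0] * (max_num_partial_prefills + 1)
    lookup[0] = max_num_batched_tokens           # no other prefill: full budget
    for i in range(1, max_num_partial_prefills + 1):
        lookup[i] = max_num_batched_tokens // i  # split the budget evenly
    return lookup


# Example: a 2048-token batch budget shared by up to 4 concurrent prefills.
print(build_prefill_budget_lookup(2048, 4))  # [2048, 2048, 1024, 682, 512]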
- self._finished_requests_ids.append(aborted_group.request_id) - for seq in aborted_group.get_seqs(): - if seq.is_finished(): - continue - seq.status = SequenceStatus.FINISHED_ABORTED - self.free_seq(seq) - if aborted_group.request_id in seq_id_to_seq_group: - del seq_id_to_seq_group[aborted_group.request_id] - - self._free_seq_group_cross_attn_blocks(aborted_group) - - def _free_seq_group_cross_attn_blocks( - self, - seq_group: SequenceGroup, - ) -> None: - """ - Free a sequence group from a cross-attention block table. - Has no effect on decoder-only models. - """ - if seq_group.is_encoder_decoder(): - self.block_manager.free_cross(seq_group) - - def has_unfinished_seqs(self) -> bool: - return (len(self.waiting) != 0 or len(self.running) != 0 - or len(self.swapped) != 0) - - def get_prefix_cache_hit_rate(self, device: Device) -> float: - return self.block_manager.get_prefix_cache_hit_rate(device) - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - return self.block_manager.reset_prefix_cache(device) - - def get_num_unfinished_seq_groups(self) -> int: - return len(self.waiting) + len(self.running) + len(self.swapped) - - def get_and_reset_finished_requests_ids(self) -> List[str]: - """Flushes the list of request ids of previously finished seq_groups.""" - finished_requests_ids = self._finished_requests_ids - self._finished_requests_ids = list() - return finished_requests_ids - - def _schedule_running( - self, - budget: SchedulingBudget, - curr_loras: Optional[Set[int]], - enable_chunking: bool = False, - partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> SchedulerRunningOutputs: - """Schedule sequence groups that are running. - - Running queue should include decode and chunked prefill requests. - - Args: - budget: The scheduling budget. The argument is in-place updated - when any decodes are preempted. - curr_loras: Currently batched lora request ids. The argument is - in-place updated when any decodes are preempted. - enable_chunking: If True, seq group can be chunked and only a - chunked number of tokens are scheduled if - `budget.num_batched_tokens` has not enough capacity to schedule - all tokens. - partial_prefill_metadata: information about the partial prefills - that are currently running - - Returns: - SchedulerRunningOutputs. - """ - ret: SchedulerRunningOutputs = self._scheduler_running_outputs_cache[ - self.cache_id].get_object() - ret.blocks_to_swap_out.clear() - ret.blocks_to_copy.clear() - ret.decode_seq_groups.clear() - ret.prefill_seq_groups.clear() - ret.preempted.clear() - ret.swapped_out.clear() - - ret.num_lookahead_slots = self._get_num_lookahead_slots( - is_prefill=False, enable_chunking=enable_chunking) - - ret.decode_seq_groups_list.clear() - ret.prefill_seq_groups_list.clear() - - # Blocks that need to be swapped or copied before model execution. - blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out - blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy - - decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups - prefill_seq_groups: List[ - ScheduledSequenceGroup] = ret.prefill_seq_groups - preempted: List[SequenceGroup] = ret.preempted - swapped_out: List[SequenceGroup] = ret.swapped_out - - running_queue = self.running - assert len(self._async_stopped) == 0 - while running_queue: - seq_group = running_queue[0] - # We discard the cached tokens info here because we don't need it - # for running sequence: - # 1. 
If a sequence is running with chunked prefill, the cached - # tokens info was already used for the first prefill. - # 2. If a sequence is running with non-chunked prefill, then - # there it's a decoding sequence, and the cached tokens info is - # irrelevant. - num_uncached_new_tokens, _ = \ - self._get_num_new_uncached_and_cached_tokens( - seq_group, - SequenceStatus.RUNNING, - enable_chunking, - budget, - partial_prefill_metadata, - ) - - num_running_tokens = num_uncached_new_tokens - if num_running_tokens == 0: - # No budget => Stop - break - - running_queue.popleft() - - # With async postprocessor, an extra decode run is done - # to process the final tokens. The check below avoids this extra - # decode run when the model max len is reached, in order to avoid - # a memory overflow. - if (self.use_async_output_proc and seq_group.seqs[0].get_len() - > self.scheduler_config.max_model_len): - self._async_stopped.append(seq_group) - continue - - # NOTE(woosuk): Preemption happens only when there is no available - # slot to keep all the sequence groups in the RUNNING state. - while not self._can_append_slots(seq_group, enable_chunking): - budget.subtract_num_batched_tokens(seq_group.request_id, - num_running_tokens) - num_running_seqs = seq_group.get_max_num_running_seqs() - budget.subtract_num_seqs(seq_group.request_id, - num_running_seqs) - - if (curr_loras is not None and seq_group.lora_int_id > 0 - and seq_group.lora_int_id in curr_loras): - curr_loras.remove(seq_group.lora_int_id) - - # Determine victim sequence - cont_loop = True - if running_queue: - # Preempt the lowest-priority sequence group. - victim_seq_group = running_queue.pop() - else: - # No other sequence group can be preempted. - # Preempt the current sequence group. - # Note: This is also where we stop this loop - # (since there is nothing else to preempt) - victim_seq_group = seq_group - cont_loop = False - - # With async postprocessor, before preempting a sequence - # we need to ensure it has no pending async postprocessor - do_preempt = True - if self.use_async_output_proc: - assert self.output_proc_callback is not None - self.output_proc_callback( - request_id=victim_seq_group.request_id) - - # It may be that the async pending "victim_seq_group" - # becomes finished, in which case we simply free it. - if victim_seq_group.is_finished(): - self._free_finished_seq_group(victim_seq_group) - do_preempt = False - - # Do preemption - if do_preempt: - preempted_mode = self._preempt(victim_seq_group, - blocks_to_swap_out) - if preempted_mode == PreemptionMode.RECOMPUTE: - preempted.append(victim_seq_group) - else: - swapped_out.append(victim_seq_group) - - if not cont_loop: - break - else: - self._append_slots(seq_group, blocks_to_copy, enable_chunking) - is_prefill = seq_group.is_prefill() - - scheduled_seq_group: ScheduledSequenceGroup = ( - self._scheduled_seq_group_cache[ - self.cache_id].get_object()) - scheduled_seq_group.seq_group = seq_group - if is_prefill: - scheduled_seq_group.token_chunk_size = num_running_tokens - prefill_seq_groups.append(scheduled_seq_group) - ret.prefill_seq_groups_list.append(seq_group) - else: - scheduled_seq_group.token_chunk_size = 1 - decode_seq_groups.append(scheduled_seq_group) - ret.decode_seq_groups_list.append(seq_group) - - budget.add_num_batched_tokens(seq_group.request_id, - num_running_tokens) - # OPTIMIZATION: Note that get_max_num_running_seqs is - # expensive. 
For the default scheduling chase where - # enable_chunking is False, num_seqs are updated before running - # this method, so we don't have to update it again here. - if enable_chunking: - num_running_seqs = seq_group.get_max_num_running_seqs() - budget.add_num_seqs(seq_group.request_id, num_running_seqs) - if curr_loras is not None and seq_group.lora_int_id > 0: - curr_loras.add(seq_group.lora_int_id) - - self._scheduler_running_outputs_cache[self.next_cache_id].reset() - self._scheduled_seq_group_cache[self.next_cache_id].reset() - - return ret - - def _schedule_swapped( - self, - budget: SchedulingBudget, - curr_loras: Optional[Set[int]], - enable_chunking: bool = False, - ) -> SchedulerSwappedInOutputs: - """Schedule sequence groups that are swapped out. - - It schedules swapped requests as long as it fits `budget` and - curr_loras <= max_lora from the scheduling config. The input arguments - `budget` and `curr_loras` are updated based on scheduled seq_groups. - - Args: - budget: The scheduling budget. The argument is in-place updated - when any requests are swapped in. - curr_loras: Currently batched lora request ids. The argument is - in-place updated when any requests are swapped in. - enable_chunking: If True, seq group can be chunked and only a - chunked number of tokens are scheduled if - `budget.num_batched_tokens` has not enough capacity to schedule - all tokens. - - Returns: - SchedulerSwappedInOutputs. - """ - # Blocks that need to be swapped or copied before model execution. - blocks_to_swap_in: List[Tuple[int, int]] = [] - blocks_to_copy: List[Tuple[int, int]] = [] - decode_seq_groups: List[ScheduledSequenceGroup] = [] - prefill_seq_groups: List[ScheduledSequenceGroup] = [] - infeasible_seq_groups: List[SequenceGroup] = [] - - swapped_queue = self.swapped - - leftover_swapped: Deque[SequenceGroup] = deque() - while swapped_queue: - seq_group = swapped_queue[0] - - # If the sequence group cannot be swapped in, stop. - is_prefill = seq_group.is_prefill() - alloc_status = self.block_manager.can_swap_in( - seq_group, - self._get_num_lookahead_slots(is_prefill, enable_chunking)) - if alloc_status == AllocStatus.LATER: - break - elif alloc_status == AllocStatus.NEVER: - logger.warning( - "Failing the request %s because there's not enough kv " - "cache blocks to run the entire sequence.", - seq_group.request_id, - ) - for seq in seq_group.get_seqs(): - seq.status = SequenceStatus.FINISHED_IGNORED - infeasible_seq_groups.append(seq_group) - swapped_queue.popleft() - continue - - lora_int_id = 0 - if self.lora_enabled: - lora_int_id = seq_group.lora_int_id - assert curr_loras is not None - assert self.lora_config is not None - if (lora_int_id > 0 and (lora_int_id not in curr_loras) - and len(curr_loras) >= self.lora_config.max_loras): - # We don't have a space for another LoRA, so - # we ignore this request for now. - leftover_swapped.appendleft(seq_group) - swapped_queue.popleft() - continue - - # The total number of sequences in the RUNNING state should not - # exceed the maximum number of sequences. 
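Victim selection in the preemption loop above always takes the tail of the FCFS running deque and falls back to self-preemption when nothing else is left. A toy sketch of that choice; plain strings stand in for sequence groups and the helper name is illustrative.

from collections import deque


def pick_preemption_victim(running: deque, current):
    # Mirrors the victim choice in the removed _schedule_running loop: the
    # lowest-priority group under FCFS sits at the tail of the running deque.
    # When nothing else is left, the current group preempts itself and the
    # caller stops trying to schedule it.
    if running:
        return running.pop(), True     # more victims may follow
    return current, False              # preempt `current` itself and stop


queue = deque(["req-1", "req-2"])
assert pick_preemption_victim(queue, current="req-0") == ("req-2", True)
assert pick_preemption_victim(deque(), current="req-0") == ("req-0", False)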
- num_new_seqs = seq_group.get_max_num_running_seqs() - num_new_tokens_uncached, num_new_tokens_cached = ( - self._get_num_new_uncached_and_cached_tokens( - seq_group, SequenceStatus.SWAPPED, enable_chunking, - budget)) - - if num_new_tokens_uncached == 0 or not budget.can_schedule( - num_new_tokens=num_new_tokens_uncached, - num_new_seqs=num_new_seqs, - ): - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.SWAPPED) - break - - if lora_int_id > 0 and curr_loras is not None: - curr_loras.add(lora_int_id) - swapped_queue.popleft() - self._swap_in(seq_group, blocks_to_swap_in) - self._append_slots(seq_group, blocks_to_copy, enable_chunking) - if is_prefill: - prefill_seq_groups.append( - ScheduledSequenceGroup( - seq_group, - token_chunk_size=num_new_tokens_uncached + - num_new_tokens_cached, - )) - else: - decode_seq_groups.append( - ScheduledSequenceGroup(seq_group, token_chunk_size=1)) - budget.add_num_batched_tokens( - seq_group.request_id, - num_batched_tokens=num_new_tokens_uncached, - num_cached_tokens=num_new_tokens_cached, - ) - budget.add_num_seqs(seq_group.request_id, num_new_seqs) - - swapped_queue.extendleft(leftover_swapped) - - return SchedulerSwappedInOutputs( - decode_seq_groups=decode_seq_groups, - prefill_seq_groups=prefill_seq_groups, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_copy=blocks_to_copy, - num_lookahead_slots=self._get_num_lookahead_slots( - is_prefill=False, enable_chunking=enable_chunking), - infeasible_seq_groups=infeasible_seq_groups, - ) - - def _get_prompt_limit(self, seq_group: SequenceGroup) -> int: - if (self.scheduler_config.chunked_prefill_enabled - and not self.scheduler_config.is_multi_step): - prompt_limit = self.scheduler_config.max_model_len - else: - prompt_limit = min( - self.scheduler_config.max_model_len, - self.scheduler_config.max_num_batched_tokens, - ) - - # Model is fine tuned with long context. Return the fine tuned max_len. - if seq_group.lora_request and seq_group.lora_request.long_lora_max_len: - assert prompt_limit <= seq_group.lora_request.long_lora_max_len - return seq_group.lora_request.long_lora_max_len - else: - return prompt_limit - - def _get_priority(self, - seq_group: SequenceGroup) -> Tuple[Optional[int], float]: - """Get the priority of the sequence group. - Highest preference to user-defined priority, followed by arrival time. - Args: - seq_group: The sequence group input. - Returns: - The priority of the sequence group. - """ - return seq_group.priority, seq_group.arrival_time - - def _schedule_priority_preemption( - self, - budget: SchedulingBudget, - ) -> int: - """Sorts waiting and running queue. Also, force preempt requests - from the running queue if their priority is lower. - Priority-based preemption is used with the priority policy. - Args: - budget: The scheduling budget. The argument is in-place updated - when any requests are scheduled. - Returns: - A count of priority-based preemptions. 
- """ - - waiting_queue = self.waiting - - running_queue = deque(sorted(self.running, key=self._get_priority)) - - blocks_to_swap_out: List[Tuple[int, int]] = [] - force_preemption_count = 0 - - if waiting_queue: - seq_group = waiting_queue.popleft() - num_new_seqs = seq_group.get_max_num_running_seqs() - num_new_tokens_uncached, _ = \ - self._get_num_new_uncached_and_cached_tokens( - seq_group, SequenceStatus.WAITING, False, budget) - - # Only preempt if priority inversion exists - while running_queue and self._get_priority( - running_queue[-1]) > self._get_priority(seq_group): - # Only preempt if waiting sequence cannot be allocated - can_allocate = self.block_manager.can_allocate(seq_group) - if (num_new_tokens_uncached > 0 - and can_allocate == AllocStatus.OK - and budget.can_schedule( - num_new_tokens=num_new_tokens_uncached, - num_new_seqs=num_new_seqs, - )): - break - - # Adjust budget to remove the victim sequence group - vseq_group = running_queue.pop() - num_running_tokens_uncached, _ = ( - self._get_num_new_uncached_and_cached_tokens( - vseq_group, SequenceStatus.RUNNING, False, budget)) - budget.subtract_num_batched_tokens( - vseq_group.request_id, num_running_tokens_uncached) - num_running_seqs = vseq_group.get_max_num_running_seqs() - budget.subtract_num_seqs(vseq_group.request_id, - num_running_seqs) - - # Preempt out the victim sequence group - self._preempt(vseq_group, blocks_to_swap_out) - waiting_queue.appendleft(vseq_group) - force_preemption_count += 1 - # Put the sequence back into the waiting queue - waiting_queue.appendleft(seq_group) - - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - - waiting_queue = deque(sorted(waiting_queue, key=self._get_priority)) - - self.waiting = waiting_queue - self.running = running_queue - return force_preemption_count - - def _schedule_prefills( - self, - budget: SchedulingBudget, - curr_loras: Optional[Set[int]], - enable_chunking: bool = False, - partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> SchedulerPrefillOutputs: - """Schedule sequence groups that are in prefill stage. - - Note that the current scheduler treats PREEMPTED_FOR_RECOMPUTE - as a new prefill (that starts from beginning -> most recently generated - tokens). - - It schedules waiting requests as long as it fits `budget` and - curr_loras <= max_lora from the scheduling config. The input arguments - `budget` and `curr_loras` are updated based on scheduled seq_groups. - - Args: - budget: The scheduling budget. The argument is in-place updated - when any requests are scheduled. - curr_loras: Currently batched lora request ids. The argument is - in-place updated when any requests are scheduled. - enable_chunking: If True, seq group can be chunked and only a - chunked number of tokens are scheduled if - `budget.num_batched_tokens` has not enough capacity to schedule - all tokens. - partial_prefill_metadata: information about the partial prefills - that are currently running - - Returns: - SchedulerPrefillOutputs. 
- """ - if budget.remaining_token_budget() == 0: - # Do nothing: Can't add any more prefill anyway - return SchedulerPrefillOutputs( - seq_groups=[], - ignored_seq_groups=[], - num_lookahead_slots=self._get_num_lookahead_slots( - is_prefill=True, enable_chunking=enable_chunking), - ) - ignored_seq_groups: List[SequenceGroup] = [] - seq_groups: List[ScheduledSequenceGroup] = [] - using_prompt_embeds: bool = False - - waiting_queue = self.waiting - - leftover_waiting_sequences: Deque[SequenceGroup] = deque() - while self._passed_delay(time.time()) and waiting_queue: - seq_group = waiting_queue[0] - - waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) - assert len(waiting_seqs) == 1, ( - "Waiting sequence group should have only one prompt " - "sequence.") - if (partial_prefill_metadata is not None - and not partial_prefill_metadata.can_schedule(seq_group)): - leftover_waiting_sequences.appendleft(seq_group) - waiting_queue.popleft() - continue - num_new_tokens_uncached, num_new_tokens_cached = ( - self._get_num_new_uncached_and_cached_tokens( - seq_group, - SequenceStatus.WAITING, - enable_chunking, - budget, - partial_prefill_metadata=partial_prefill_metadata, - )) - num_new_tokens = num_new_tokens_uncached + num_new_tokens_cached - - if not enable_chunking: - num_prompt_tokens = waiting_seqs[0].get_len() - assert num_new_tokens == num_prompt_tokens - - prompt_limit = self._get_prompt_limit(seq_group) - if num_new_tokens > prompt_limit: - logger.warning( - "Input prompt (%d tokens) is too long" - " and exceeds limit of %d", - num_new_tokens, - prompt_limit, - ) - for seq in waiting_seqs: - seq.status = SequenceStatus.FINISHED_IGNORED - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.FINISHED_IGNORED) - ignored_seq_groups.append(seq_group) - waiting_queue.popleft() - continue - - num_lookahead_slots: int = 0 - if self.scheduler_config.is_multi_step and enable_chunking: - num_lookahead_slots = self._get_num_lookahead_slots( - True, enable_chunking) - - # If the sequence group cannot be allocated, stop. - can_allocate = self.block_manager.can_allocate( - seq_group, num_lookahead_slots=num_lookahead_slots) - if can_allocate == AllocStatus.LATER: - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - break - elif can_allocate == AllocStatus.NEVER: - logger.warning( - "Input prompt (%d tokens) + lookahead slots (%d) is " - "too long and exceeds the capacity of block_manager", - num_new_tokens, - num_lookahead_slots, - ) - for seq in waiting_seqs: - seq.status = SequenceStatus.FINISHED_IGNORED - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.FINISHED_IGNORED) - ignored_seq_groups.append(seq_group) - waiting_queue.popleft() - continue - - # We cannot mix sequence groups that use prompt embeds and - # those that do not. - if len(seq_groups) == 0: - using_prompt_embeds = seq_group.uses_prompt_embeds() - if using_prompt_embeds != seq_group.uses_prompt_embeds(): - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - leftover_waiting_sequences.appendleft(seq_group) - waiting_queue.popleft() - continue - - lora_int_id = 0 - if self.lora_enabled: - lora_int_id = seq_group.lora_int_id - assert curr_loras is not None - assert self.lora_config is not None - if (self.lora_enabled and lora_int_id > 0 - and lora_int_id not in curr_loras - and len(curr_loras) >= self.lora_config.max_loras): - # We don't have a space for another LoRA, so - # we ignore this request for now. 
- self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - leftover_waiting_sequences.appendleft(seq_group) - waiting_queue.popleft() - continue - - if (budget.num_batched_tokens - >= self.scheduler_config.max_num_batched_tokens): - # We've reached the budget limit - since there might be - # continuous prefills in the running queue, we should break - # to avoid scheduling any new prefills. - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - break - - num_new_seqs = seq_group.get_max_num_running_seqs() - if num_new_tokens_uncached == 0 or not budget.can_schedule( - num_new_tokens=num_new_tokens_uncached, - num_new_seqs=num_new_seqs, - ): - self.remove_seq_from_computed_blocks_tracker( - seq_group, SequenceStatus.WAITING) - break - - # Can schedule this request. - if curr_loras is not None and lora_int_id > 0: - curr_loras.add(lora_int_id) - waiting_queue.popleft() - self._allocate_and_set_running(seq_group) - - if partial_prefill_metadata is not None: - partial_prefill_metadata.maybe_increment_partial_prefills( - seq_group) - - if enable_chunking and self.scheduler_config.is_multi_step: - blocks_to_copy: List[Tuple[int, int]] = [] - # init_multi_step_from_lookahead_slots happens in append_slots - self._append_slots(seq_group, blocks_to_copy, enable_chunking) - # This assert will trip when a copy-on-write happens. This is - # not a concern as the very first sequence-group block - # allocation happens above. Still, we have the assert to - # catch any edge-cases. - assert not blocks_to_copy - else: - seq_group.init_multi_step_from_lookahead_slots( - num_lookahead_slots, - num_scheduler_steps=self.scheduler_config. - num_scheduler_steps, - is_multi_step=self.scheduler_config.is_multi_step, - enable_chunking=enable_chunking, - ) - - seq_groups.append( - ScheduledSequenceGroup(seq_group=seq_group, - token_chunk_size=num_new_tokens)) - budget.add_num_batched_tokens( - seq_group.request_id, - num_batched_tokens=num_new_tokens_uncached, - num_cached_tokens=num_new_tokens_cached, - ) - budget.add_num_seqs(seq_group.request_id, num_new_seqs) - - # Queue requests that couldn't be scheduled. - waiting_queue.extendleft(leftover_waiting_sequences) - if len(seq_groups) > 0: - self.prev_prompt = True - - return SchedulerPrefillOutputs( - seq_groups=seq_groups, - ignored_seq_groups=ignored_seq_groups, - num_lookahead_slots=self._get_num_lookahead_slots( - is_prefill=True, enable_chunking=enable_chunking), - ) - - def _schedule_default(self) -> SchedulerOutputs: - """Schedule queued requests. - - The current policy is designed to optimize the throughput. First, - it batches as many prefill requests as possible. And it schedules - decodes. If there's a pressure on GPU memory, decode requests can - be swapped or preempted. - """ - # Include running requests to the budget. - budget = SchedulingBudget( - token_budget=self.scheduler_config.max_num_batched_tokens, - max_num_seqs=self.scheduler_config.max_num_seqs, - ) - # Make sure we include num running seqs before scheduling prefill, - # so that we don't schedule beyond max_num_seqs for prefill. 
- for seq_group in self.running: - budget.add_num_seqs(seq_group.request_id, - seq_group.get_max_num_running_seqs()) - curr_loras = (set( - seq_group.lora_int_id for seq_group in self.running - if seq_group.lora_int_id > 0) if self.lora_enabled else None) - - prefills = SchedulerPrefillOutputs.create_empty() - running_scheduled = SchedulerRunningOutputs.create_empty() - swapped_in = SchedulerSwappedInOutputs.create_empty() - - # If any requests are swapped, prioritized swapped requests. - if not self.swapped: - prefills = self._schedule_prefills(budget, - curr_loras, - enable_chunking=False) - - if len(prefills.seq_groups - ) == 0 and self.scheduler_config.policy == "priority": - self._schedule_priority_preemption(budget) - - # Don't schedule decodes if prefills are scheduled. - # NOTE: If `_schedule_prefills` doesn't enable chunking, self.running - # only contains decode requests, not chunked prefills. - if len(prefills.seq_groups) == 0: - running_scheduled = self._schedule_running(budget, - curr_loras, - enable_chunking=False) - - # If any sequence group is preempted, do not swap in any sequence - # group. because it means there's no slot for new running requests. - if (len(running_scheduled.preempted) + - len(running_scheduled.swapped_out) == 0): - swapped_in = \ - self._schedule_swapped(budget, curr_loras) - - assert (budget.num_batched_tokens - <= self.scheduler_config.max_num_batched_tokens) - assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs - - # Update waiting requests. - self.waiting.extendleft(running_scheduled.preempted) - # Update new running requests. - if len(prefills.seq_groups) > 0: - self.running.extend([s.seq_group for s in prefills.seq_groups]) - - self.running.extend(running_scheduled.decode_seq_groups_list) - - if len(swapped_in.decode_seq_groups) > 0: - self.running.extend( - [s.seq_group for s in swapped_in.decode_seq_groups]) - - # Update swapped requests. - self.swapped.extend(running_scheduled.swapped_out) - preempted = len(running_scheduled.preempted) + len( - running_scheduled.swapped_out) - - # There should be no prefill from running queue because this policy - # doesn't allow chunked prefills. 
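The control flow of the deleted _schedule_default reads more clearly outside diff form: new prefills are batched first, decodes run only when no prefill was scheduled, and swapped requests come back only when that decode pass preempted nothing. The sketch below is a simplified restatement with callables standing in for the real sub-schedulers; it is not a vLLM API.

def schedule_default_sketch(has_swapped: bool,
                            schedule_prefills,
                            schedule_running,
                            schedule_swapped):
    # Mirrors the branch structure of the removed _schedule_default.
    prefills = [] if has_swapped else schedule_prefills()
    if prefills:
        return prefills, [], []                 # a pure prefill batch
    decodes, preempted = schedule_running()
    swapped_in = schedule_swapped() if not preempted else []
    return prefills, decodes, swapped_in


# With nothing swapped out and prompts waiting, only prefills are scheduled.
out = schedule_default_sketch(
    has_swapped=False,
    schedule_prefills=lambda: ["prompt-0", "prompt-1"],
    schedule_running=lambda: ([], []),
    schedule_swapped=lambda: [],
)
assert out == (["prompt-0", "prompt-1"], [], [])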
- assert len(running_scheduled.prefill_seq_groups) == 0 - assert len(swapped_in.prefill_seq_groups) == 0 - - # Merge lists - num_prefill_groups = len(prefills.seq_groups) - ignored_seq_groups_for_embeds = list[SequenceGroup]() - if num_prefill_groups > 0: - scheduled_seq_groups = prefills.seq_groups - scheduled_seq_groups.extend(running_scheduled.decode_seq_groups) - ignored_seq_groups_for_embeds.clear() - else: - scheduled_seq_groups = running_scheduled.decode_seq_groups - if len(scheduled_seq_groups) > 0: - using_prompt_embeds = scheduled_seq_groups[ - 0].seq_group.uses_prompt_embeds() - ignored_seq_groups_for_embeds.clear() - indices_ignored = list[int]() - for i, schedule_seq_group in enumerate(scheduled_seq_groups): - if using_prompt_embeds !=\ - schedule_seq_group.seq_group.uses_prompt_embeds(): - ignored_seq_groups_for_embeds.append( - schedule_seq_group.seq_group) - indices_ignored.append(i) - if len(ignored_seq_groups_for_embeds) > 0: - scheduled_seq_groups = [ - group for i, group in enumerate(scheduled_seq_groups) - if i not in indices_ignored - ] - else: - ignored_seq_groups_for_embeds.clear() - - scheduled_seq_groups.extend(swapped_in.decode_seq_groups) - - blocks_to_copy = running_scheduled.blocks_to_copy - blocks_to_copy.extend(swapped_in.blocks_to_copy) - - ignored_seq_groups = prefills.ignored_seq_groups - ignored_seq_groups.extend(ignored_seq_groups_for_embeds) - ignored_seq_groups.extend(swapped_in.infeasible_seq_groups) - - return SchedulerOutputs( - scheduled_seq_groups=scheduled_seq_groups, - num_prefill_groups=num_prefill_groups, - num_batched_tokens=budget.num_batched_tokens + - budget.num_cached_tokens, - blocks_to_swap_in=swapped_in.blocks_to_swap_in, - blocks_to_swap_out=running_scheduled.blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ignored_seq_groups=ignored_seq_groups, - num_lookahead_slots=running_scheduled.num_lookahead_slots, - running_queue_size=len(self.running), - preempted=preempted, - ) - - def _schedule_chunked_prefill(self) -> SchedulerOutputs: - """Schedule queued requests. - - Chunked prefill allows to chunk prefill requests, batch them together - with decode requests. This policy 1. schedule as many decoding requests - as possible. 2. schedule chunked prefill requests that are not - finished. 3. schedule swapped request. 4. schedule new prefill - requests. - - The policy can sustain the high GPU utilization because it can put - prefill and decodes requests to the same batch, while it improves - inter token latency because decodes requests don't need to be blocked - by prefill requests. - """ - budget = SchedulingBudget( - token_budget=self.scheduler_config.max_num_batched_tokens, - max_num_seqs=self.scheduler_config.max_num_seqs, - ) - curr_loras: Set[int] = set() - - prefills = SchedulerPrefillOutputs.create_empty() - swapped_in = SchedulerSwappedInOutputs.create_empty() - - # Create partial prefill metadata - partial_prefill_metadata = PartialPrefillMetadata.from_queues( - running=self.running, - waiting=self.waiting, - scheduler_config=self.scheduler_config, - ) - - # Decoding should be always scheduled first by fcfs. - running_scheduled = self._schedule_running( - budget, - curr_loras, - enable_chunking=True, - partial_prefill_metadata=partial_prefill_metadata, - ) - - # Schedule swapped out requests. - # If preemption happens, it means we don't have space for swap-in. 
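Under chunked prefill the scheduling order (decodes first, to protect inter-token latency) differs from the batch order handed to the model, which lists prefill groups first per the attention-backend assumption noted further down in this hunk. An illustrative sketch of that final ordering; the function name is not part of vLLM.

def chunked_prefill_batch_order(new_prefills, running_prefills,
                                swapped_prefills, running_decodes,
                                swapped_decodes):
    # Although decodes are *scheduled* first, the final batch puts every
    # prefill group ahead of every decode group.
    scheduled = (new_prefills + running_prefills + swapped_prefills
                 + running_decodes + swapped_decodes)
    num_prefill_groups = (len(new_prefills) + len(running_prefills)
                          + len(swapped_prefills))
    return scheduled, num_prefill_groups


# Two chunked prefills and three decodes in one batch: prefills lead.
batch, n_prefill = chunked_prefill_batch_order(
    new_prefills=["p0"], running_prefills=["p1"], swapped_prefills=[],
    running_decodes=["d0", "d1"], swapped_decodes=["d2"])
assert batch[:n_prefill] == ["p0", "p1"]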
- if len(running_scheduled.preempted) + len( - running_scheduled.swapped_out) == 0: - swapped_in = self._schedule_swapped(budget, curr_loras) - - prefills = self._schedule_prefills( - budget, - curr_loras, - enable_chunking=True, - partial_prefill_metadata=partial_prefill_metadata, - ) - - assert (budget.num_batched_tokens - <= self.scheduler_config.max_num_batched_tokens) - assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs - - # Update waiting requests. - self.waiting.extendleft(running_scheduled.preempted) - - # Update new running requests. - # By default, vLLM scheduler prioritizes prefills. - # Once chunked prefill is enabled, - # the policy is changed to prioritize decode requests. - self.running.extend( - [s.seq_group for s in swapped_in.decode_seq_groups]) - self.running.extend( - [s.seq_group for s in swapped_in.prefill_seq_groups]) - self.running.extend( - [s.seq_group for s in running_scheduled.decode_seq_groups]) - # Because multiple prefills may be running concurrently, we need to - # make sure that prefills which are scheduled to finish are listed - # before those that won't. This is so that on the next scheduling - # iteration when they have transitioned to the decode stage, they are - # properly prioritized over sequences that are still in the prefill - # stage. - self.running.extend( - self._order_finishing_prefills_first( - running_scheduled.prefill_seq_groups)) - self.running.extend([s.seq_group for s in prefills.seq_groups]) - - # Update swapped requests. - self.swapped.extend(running_scheduled.swapped_out) - # Put prefills first due to Attention backend ordering assumption. - scheduled_seq_groups = (prefills.seq_groups + - running_scheduled.prefill_seq_groups + - swapped_in.prefill_seq_groups + - running_scheduled.decode_seq_groups + - swapped_in.decode_seq_groups) - num_prefill_groups = (len(prefills.seq_groups) + - len(swapped_in.prefill_seq_groups) + - len(running_scheduled.prefill_seq_groups)) - # If all prompts, then we set num_lookahead_slots to 0 - # this allows us to go through the `no_spec` path in - # `spec_decode_worker.py` - all_prefills = len(scheduled_seq_groups) == num_prefill_groups - num_lookahead_slots = (0 if - (all_prefills - and not self.scheduler_config.is_multi_step) - else running_scheduled.num_lookahead_slots) - return SchedulerOutputs( - scheduled_seq_groups=scheduled_seq_groups, - num_prefill_groups=num_prefill_groups, - num_batched_tokens=budget.num_batched_tokens + - budget.num_cached_tokens, - blocks_to_swap_in=swapped_in.blocks_to_swap_in, - blocks_to_swap_out=running_scheduled.blocks_to_swap_out, - blocks_to_copy=running_scheduled.blocks_to_copy + - swapped_in.blocks_to_copy, - ignored_seq_groups=prefills.ignored_seq_groups + - swapped_in.infeasible_seq_groups, - num_lookahead_slots=num_lookahead_slots, - running_queue_size=len(self.running), - preempted=(len(running_scheduled.preempted) + - len(running_scheduled.swapped_out)), - ) - - def _order_finishing_prefills_first( - self, scheduled_prefill_seqs: List[ScheduledSequenceGroup] - ) -> List[SequenceGroup]: - """Returns a list of prefilling SequenceGroups where sequences that are - scheduled to finish prefilling are listed first""" - finishing = [ - s.seq_group for s in scheduled_prefill_seqs - if s.seq_group.get_num_uncomputed_tokens() == s.token_chunk_size - ] - not_finishing = [ - s.seq_group for s in scheduled_prefill_seqs - if s.seq_group.get_num_uncomputed_tokens() != s.token_chunk_size - ] - return finishing + not_finishing - - def _schedule(self) -> 
SchedulerOutputs: - """Schedule queued requests.""" - if self.scheduler_config.chunked_prefill_enabled: - return self._schedule_chunked_prefill() - else: - return self._schedule_default() - - def _can_append_slots(self, seq_group: SequenceGroup, - enable_chunking: bool) -> bool: - """Determine whether or not we have enough space in the KV cache to - continue generation of the sequence group. - """ - # It is True only for testing case to trigger artificial preemption. - if (self.enable_artificial_preemption - and random.uniform(0, 1) < ARTIFICIAL_PREEMPTION_PROB - and self.artificial_preempt_cnt > 0): - self.artificial_preempt_cnt -= 1 - return False - - is_prefill = seq_group.is_prefill() - num_lookahead_slots = self._get_num_lookahead_slots( - is_prefill, enable_chunking) - - if is_prefill and num_lookahead_slots > 0: - # Appending prefill slots only happens multi-step and - # chunked-prefill are enabled together. - assert self.scheduler_config.is_multi_step and enable_chunking - - return self.block_manager.can_append_slots( - seq_group=seq_group, num_lookahead_slots=num_lookahead_slots) - - def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool: - # async_output_proc is allowed only when we have a single sequence - # in the sequence group - no_single_seq = seq_group.sampling_params is None or ( - seq_group.sampling_params.n == 1) - return no_single_seq - - def schedule( - self - ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]: - # Schedule sequence groups. - # This function call changes the internal states of the scheduler - # such as self.running, self.swapped, and self.waiting. - scheduler_start_time = time.perf_counter() - - scheduler_outputs: SchedulerOutputs = self._schedule() - now = time.time() - - if not self.cache_config.enable_prefix_caching: - common_computed_block_nums = [] - - allow_async_output_proc: bool = self.use_async_output_proc - - # Create input data structures. - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - for i, scheduled_seq_group in enumerate( - scheduler_outputs.scheduled_seq_groups): - seq_group = scheduled_seq_group.seq_group - token_chunk_size = scheduled_seq_group.token_chunk_size - seq_group.maybe_set_first_scheduled_time(now) - - seq_group_metadata = self._seq_group_metadata_cache[ - self.cache_id].get_object() - seq_group_metadata.seq_data.clear() - seq_group_metadata.block_tables.clear() - - # seq_id -> SequenceData - seq_data: Dict[int, SequenceData] = {} - # seq_id -> physical block numbers - block_tables: Dict[int, List[int]] = {} - - if seq_group.is_encoder_decoder(): - # Encoder associated with SequenceGroup - encoder_seq = seq_group.get_encoder_seq() - assert encoder_seq is not None - encoder_seq_data = encoder_seq.data - # Block table for cross-attention - # Also managed at SequenceGroup level - cross_block_table = self.block_manager.get_cross_block_table( - seq_group) - else: - encoder_seq_data = None - cross_block_table = None - - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - seq_id = seq.seq_id - seq_data[seq_id] = seq.data - block_tables[seq_id] = self.block_manager.get_block_table(seq) - self.block_manager.access_all_blocks_in_seq(seq, now) - - if self.cache_config.enable_prefix_caching: - common_computed_block_nums = ( - self.block_manager.get_common_computed_block_ids( - seq_group.get_seqs(status=SequenceStatus.RUNNING))) - - do_sample = True - is_prompt = seq_group.is_prefill() - # We should send the metadata to workers when the first prefill - # is sent. 
Subsequent requests could be chunked prefill or decode. - is_first_prefill = False - if is_prompt: - seqs = seq_group.get_seqs() - # Prefill has only 1 sequence. - assert len(seqs) == 1 - num_computed_tokens = seqs[0].data.get_num_computed_tokens() - is_first_prefill = num_computed_tokens == 0 - # In the next iteration, all prompt tokens are not computed. - # It means the prefill is chunked, and we don't need sampling. - # NOTE: We use get_len instead of get_prompt_len because when - # a sequence is preempted, prefill includes previous generated - # output tokens. - if (token_chunk_size + num_computed_tokens - < seqs[0].data.get_len()): - do_sample = False - - # It assumes the scheduled_seq_groups is ordered by - # prefill < decoding. - if is_first_prefill or not self.scheduler_config.send_delta_data: - seq_group_metadata = SequenceGroupMetadata( - request_id=seq_group.request_id, - is_prompt=is_prompt, - seq_data=seq_data, - sampling_params=seq_group.sampling_params, - block_tables=block_tables, - do_sample=do_sample, - pooling_params=seq_group.pooling_params, - token_chunk_size=token_chunk_size, - lora_request=seq_group.lora_request, - computed_block_nums=common_computed_block_nums, - encoder_seq_data=encoder_seq_data, - cross_block_table=cross_block_table, - state=seq_group.state, - token_type_ids=seq_group.token_type_ids, - # `multi_modal_data` will only be present for the 1st comm - # between engine and worker. - # the subsequent comms can still use delta, but - # `multi_modal_data` will be None. - multi_modal_data=(seq_group.multi_modal_data - if scheduler_outputs.num_prefill_groups - > 0 else None), - multi_modal_placeholders=( - seq_group.multi_modal_placeholders - if scheduler_outputs.num_prefill_groups > 0 else None), - ) - else: - # When SPMD mode is enabled, we only send delta data except for - # the first request to reduce serialization cost. - seq_data_delta = {} - for id, data in seq_data.items(): - seq_data_delta[id] = data.get_delta_and_reset() - seq_group_metadata = SequenceGroupMetadataDelta( - seq_data_delta, - seq_group.request_id, - block_tables, - is_prompt, - do_sample=do_sample, - token_chunk_size=token_chunk_size, - computed_block_nums=common_computed_block_nums, - ) - seq_group_metadata_list.append(seq_group_metadata) - - if allow_async_output_proc: - allow_async_output_proc = self._allow_async_output_proc( - seq_group) - - # Now that the batch has been created, we can assume all blocks in the - # batch will have been computed before the next scheduling invocation. - # This is because the engine assumes that a failure in model execution - # will crash the vLLM instance / will not retry. - for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: - self.block_manager.mark_blocks_as_computed( - scheduled_seq_group.seq_group, - scheduled_seq_group.token_chunk_size) - - self._seq_group_metadata_cache[self.next_cache_id].reset() - - scheduler_time = time.perf_counter() - scheduler_start_time - # Add this to scheduler time to all the sequences that are currently - # running. This will help estimate if the scheduler is a significant - # component in the e2e latency. 
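Two per-prefill flags computed in the deleted schedule() loop decide whether full metadata is sent to the workers and whether sampling runs for this chunk. A small sketch of that arithmetic; the helper name is illustrative.

def prefill_chunk_flags(num_computed_tokens: int,
                        token_chunk_size: int,
                        total_len: int) -> tuple[bool, bool]:
    # The first chunk must ship full metadata; sampling is skipped whenever
    # the chunk still leaves prompt tokens uncomputed.
    is_first_prefill = num_computed_tokens == 0
    do_sample = token_chunk_size + num_computed_tokens >= total_len
    return is_first_prefill, do_sample


# A 1000-token prompt prefilled in two chunks samples only on the final chunk.
assert prefill_chunk_flags(0, 512, 1000) == (True, False)
assert prefill_chunk_flags(512, 488, 1000) == (False, True)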
- for seq_group in self.running: - if seq_group is not None and seq_group.metrics is not None: - if seq_group.metrics.scheduler_time is not None: - seq_group.metrics.scheduler_time += scheduler_time - else: - seq_group.metrics.scheduler_time = scheduler_time - - # Move to next cache (if exists) - self.cache_id = self.next_cache_id - - # Return results - return (seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc) - - def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None: - self.block_manager.fork(parent_seq, child_seq) - - def free_seq(self, seq: Sequence) -> None: - """Free a sequence from a block table.""" - self.block_manager.free(seq) - - def remove_seq_from_computed_blocks_tracker( - self, seq_group: SequenceGroup, - status: Optional[SequenceStatus]) -> None: - seqs = seq_group.get_seqs(status=status) - for seq in seqs: - self._remove_seq_from_computed_blocks_tracker(seq) - - def _remove_seq_from_computed_blocks_tracker(self, seq: Sequence) -> None: - """ - Free a sequence computed blocks tracker _seq_id_to_blocks_hashes - and _seq_id_to_num_tokens_computed. - """ - self.block_manager.remove_seq_from_computed_blocks_tracker(seq) - - def _free_finished_seqs(self, seq_group: SequenceGroup) -> None: - """Free finished seqs in a sequence group.""" - for seq in seq_group.get_seqs(): - if seq.is_finished(): - self.free_seq(seq) - - def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None: - if seq_group.is_finished(): - # Free cross-attention block table, if it exists - self._free_seq_group_cross_attn_blocks(seq_group) - - # Add the finished requests to the finished requests list. - # This list will be used to update the Mamba cache in the - # next step. - self._finished_requests_ids.append(seq_group.request_id) - - # Free finished seqs - self._free_finished_seqs(seq_group) - - def free_finished_seq_groups(self) -> None: - remaining: Deque[SequenceGroup] = deque() - for seq_group in self.running: - self._free_finished_seq_group(seq_group) - if not seq_group.is_finished(): - remaining.append(seq_group) - - self.running = remaining - - # Handle async stopped sequence groups - # (ones that reached max model len) - if self._async_stopped: - for seq_group in self._async_stopped: - self._free_seq_group_cross_attn_blocks(seq_group) - self._finished_requests_ids.append(seq_group.request_id) - - # Free finished seqs - self._free_finished_seqs(seq_group) - - self._async_stopped.clear() - - def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None: - self.block_manager.allocate(seq_group) - for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): - seq.status = SequenceStatus.RUNNING - - def _append_slots( - self, - seq_group: SequenceGroup, - blocks_to_copy: List[Tuple[int, int]], - enable_chunking: bool = False, - ) -> None: - """Appends new slots to the sequences in the given sequence group. - - Args: - seq_group (SequenceGroup): The sequence group containing the - sequences to append slots to. - blocks_to_copy (List[Tuple[int, int]]): A list of tuple of two - ints, the first int is the source block index, and the second - int is the destination block index. This list is updated with - the new source and destination block indices for the appended - slots. - enable_chunking (bool): True if chunked prefill is enabled. 
- """ - is_prefill: bool = seq_group.is_prefill() - num_lookahead_slots: int = self._get_num_lookahead_slots( - is_prefill, enable_chunking) - - seq_group.init_multi_step_from_lookahead_slots( - num_lookahead_slots, - num_scheduler_steps=self.scheduler_config.num_scheduler_steps, - is_multi_step=self.scheduler_config.is_multi_step, - enable_chunking=enable_chunking, - ) - - seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING - if self.scheduler_config.is_multi_step and enable_chunking: - # In multi-step chunked-prefill any sequence type can have - # slots appended. - seq_status = None - - for seq in seq_group.get_seqs(status=seq_status): - cows = self.block_manager.append_slots(seq, num_lookahead_slots) - if len(cows) > 0: - blocks_to_copy.extend(cows) - - def _preempt(self, seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode: - # If preemption mode is not specified, we determine the mode as follows: - # We use recomputation by default since it incurs lower overhead than - # swapping. However, when the sequence group has multiple sequences - # (e.g., beam search), recomputation is not currently supported. In - # such a case, we use swapping instead. - # FIXME(woosuk): This makes our scheduling policy a bit bizarre. - # As swapped sequences are prioritized over waiting sequences, - # sequence groups with multiple sequences are implicitly prioritized - # over sequence groups with a single sequence. - # TODO(woosuk): Support recomputation for sequence groups with multiple - # sequences. This may require a more sophisticated CUDA kernel. - if self.user_specified_preemption_mode is None: - if seq_group.get_max_num_running_seqs() == 1: - preemption_mode = PreemptionMode.RECOMPUTE - else: - preemption_mode = PreemptionMode.SWAP - - elif self.user_specified_preemption_mode == "swap": - preemption_mode = PreemptionMode.SWAP - else: - preemption_mode = PreemptionMode.RECOMPUTE - - if self.num_cumulative_preemption % 50 == 0: - logger.warning( - "Sequence group %s is preempted by %s mode because there is " - "not enough KV cache space. This can affect the end-to-end " - "performance. Increase gpu_memory_utilization or " - "tensor_parallel_size to provide more KV cache memory. 
" - "total_num_cumulative_preemption=%d", - seq_group.request_id, - preemption_mode, - self.num_cumulative_preemption + 1, - ) - self.num_cumulative_preemption += 1 - - if preemption_mode == PreemptionMode.RECOMPUTE: - self._preempt_by_recompute(seq_group) - elif preemption_mode == PreemptionMode.SWAP: - self._preempt_by_swap(seq_group, blocks_to_swap_out) - else: - raise AssertionError("Invalid preemption mode.") - return preemption_mode - - def _preempt_by_recompute( - self, - seq_group: SequenceGroup, - ) -> None: - seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING) - assert len(seqs) == 1 - for seq in seqs: - seq.status = SequenceStatus.WAITING - self.free_seq(seq) - seq.reset_state_for_recompute() - self._free_seq_group_cross_attn_blocks(seq_group) - - def _preempt_by_swap( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]], - ) -> None: - self._swap_out(seq_group, blocks_to_swap_out) - - def _swap_in( - self, - seq_group: SequenceGroup, - blocks_to_swap_in: List[Tuple[int, int]], - ) -> None: - mapping = self.block_manager.swap_in(seq_group) - blocks_to_swap_in.extend(mapping) - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - seq.status = SequenceStatus.RUNNING - - def _swap_out( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]], - ) -> None: - if not self.block_manager.can_swap_out(seq_group): - # FIXME(woosuk): Abort the sequence group instead of aborting the - # entire engine. - raise RuntimeError( - "Aborted due to the lack of CPU swap space. Please increase " - "the swap space to avoid this error.") - mapping = self.block_manager.swap_out(seq_group) - blocks_to_swap_out.extend(mapping) - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - seq.status = SequenceStatus.SWAPPED - - def _passed_delay(self, now: float) -> bool: - if self.prev_prompt: - self.last_prompt_latency = now - self.prev_time - self.prev_time, self.prev_prompt = now, False - # Delay scheduling prompts to let waiting queue fill up - if self.scheduler_config.delay_factor > 0 and self.waiting: - earliest_arrival_time = min( - [e.metrics.arrival_time for e in self.waiting]) - passed_delay = ((now - earliest_arrival_time) - > (self.scheduler_config.delay_factor * - self.last_prompt_latency) or not self.running) - else: - passed_delay = True - return passed_delay - - def _get_num_lookahead_slots(self, is_prefill: bool, - enable_chunking: bool) -> int: - """The number of slots to allocate per sequence per step, beyond known - token ids. Speculative decoding uses these slots to store KV activations - of tokens which may or may not be accepted. - - Speculative decoding does not yet support prefill, so we do not perform - lookahead allocation for prefill. - - When chunking is enabled with multi-step, we allocate lookahead slots - for the prefills for when the prefills turn into decodes in the first - step. - """ - if is_prefill: - if self.scheduler_config.is_multi_step and enable_chunking: - # num_lookahead_slots was introduced in the context of decodes, - # in Speculative Decoding. - # When the num_scheduler_steps is 8, say, then the - # num_lookahead_slots is 7. Meaning, we are doing a 1-step of - # decode anyways and we wish to do 7 more. - # - # "lookaheads" for prefills, is introduced in support for - # Chunked-Prefill in Multi-Step. 
- return self.scheduler_config.num_lookahead_slots + 1 - else: - return 0 - - return self.scheduler_config.num_lookahead_slots - - def _get_num_new_uncached_and_cached_tokens( - self, - seq_group: SequenceGroup, - status: SequenceStatus, - enable_chunking: bool, - budget: SchedulingBudget, - partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> Tuple[int, int]: - """ - Returns the number of new uncached and cached tokens to schedule for a - given sequence group that's in a given `status`. - - The API could chunk the number of tokens to compute based on `budget` - if `enable_chunking` is True. If a sequence group has multiple - sequences (e.g., running beam search), it means it is in decoding - phase, so chunking doesn't happen. - - Returns (0, 0) if the new token cannot be computed due to token budget. - - The cached tokens's blocks are already computed, and the attention - backend will reuse the cached blocks rather than recomputing them. So - the scheduler could schedule these cached tokens "for free". - - Args: - seq_group: The sequence group to get the number of new tokens to - schedule. - status: The status of the sequences to get the number of new tokens - to schedule. - enable_chunking: Whether to chunk the number of tokens to compute. - budget: The budget to chunk the number of tokens to compute. - partial_prefill_metadata: information about the partial prefills - that are currently running - - - Returns: - A tuple of two ints. The first int is the number of new uncached - tokens to schedule. The second int is the number of cached tokens. - If no more new tokens can be scheduled, returns (0, 0). - """ - num_cached_new_tokens = 0 - num_uncached_new_tokens = 0 - - seqs = seq_group.get_seqs(status=status) - # Compute the number of new uncached and cached tokens for - # each sequence. - for seq in seqs: - if not seq.is_prefill(): - # Decode sequences should always just have 1 uncached token - # TODO(rickyx): Actually is this still correct for multi-step? - num_uncached_new_tokens += 1 - continue - - num_computed_tokens_seq = seq.get_num_computed_tokens() - all_num_new_tokens_seq = seq.get_len() - num_computed_tokens_seq - if not self.cache_config.enable_prefix_caching: - # If prefix caching is not enabled, all new tokens are uncached. - num_uncached_new_tokens += all_num_new_tokens_seq - continue - - # NOTE: the cache token might be currently in a block that's in an - # evictor meaning that it's not yet allocated. However, we don't - # exclude such tokens in the cache count because it will be - # guaranteed to be allocated later if the sequence can be allocated. - num_cached_tokens_seq = self.block_manager.get_num_cached_tokens( - seq) - - # Sanity check. - if num_cached_tokens_seq < num_computed_tokens_seq: - # This should only happen with chunked prefill, and - # the seq is still in prefill. The `num_cached_tokens_seq` - # is the value we calculated on scheduling the first prefill. - # For subsequent continuous prefill steps, we cached the - # number of cache tokens for the sequence so the cached token - # count could be less than the number of computed tokens. - # See comments on `ComputedBlocksTracker` for more details. - assert ( - seq.is_prefill() and seq.status == SequenceStatus.RUNNING - and self.scheduler_config.chunked_prefill_enabled - ), ("Number of cached tokens should not be less than the " - "number of computed tokens for a sequence that's still " - f"in prefill. 
But there are {num_cached_tokens_seq} cached " - f"tokens and {num_computed_tokens_seq} computed tokens " - f"for sequence {seq.seq_id}.") - - num_cached_new_tokens_seq = max( - 0, num_cached_tokens_seq - num_computed_tokens_seq) - num_uncached_new_tokens_seq = (all_num_new_tokens_seq - - num_cached_new_tokens_seq) - - num_uncached_new_tokens += num_uncached_new_tokens_seq - num_cached_new_tokens += num_cached_new_tokens_seq - - if num_uncached_new_tokens == 0 and num_cached_new_tokens > 0: - # For a fully cached hit sequence, we actually need to recompute the - # last token. So we need at least 1 uncached token to schedule. - # See ModelRunner._compute_for_prefix_cache_hit for more details. - num_uncached_new_tokens = 1 - num_cached_new_tokens -= 1 - - if enable_chunking and len(seqs) == 1: - # Chunk if a running request cannot fit in the given budget. - # If number of seq > 1, it means it is doing beam search - # in a decode phase. Do not chunk. - num_uncached_new_tokens = self._chunk_new_tokens_to_schedule( - self.scheduler_config, - self.cache_config, - budget, - self._get_prompt_limit(seq_group), - num_uncached_new_tokens, - self.partial_prefill_budget_lookup_list, - partial_prefill_metadata, - ) - - return num_uncached_new_tokens, num_cached_new_tokens - - @staticmethod - def _chunk_new_tokens_to_schedule( - scheduler_config: SchedulerConfig, - cache_config: CacheConfig, - budget: SchedulingBudget, - prompt_limit: int, - num_new_tokens: int, - partial_prefill_budget_lookup_list: List[int], - partial_prefill_metadata: Optional[PartialPrefillMetadata] = None, - ) -> int: - """ - Chunks the number of new tokens to schedule based on the budget when - chunked prefill is enabled. - - Args: - scheduler_config: The scheduler config. - cache_config: The cache config. - budget: The budget to chunk the number of tokens to compute. - prompt_limit: The maximum number of tokens allowed in a prompt. - num_new_tokens: The number of new tokens to schedule. - - Returns: - The number of new tokens to schedule after chunking. - """ - remaining_token_budget = budget.remaining_token_budget() - if scheduler_config.is_multi_step: - # The current multi-step + chunked prefill capability does - # not actually support chunking prompts. - # - # Therefore, `num_new_tokens` is computed in the same fashion - # for both multi-step+chunked-prefill & - # multi-step+chunked-prefill+APC - # - # Prompts with more tokens than the current remaining budget - # are postponed to future scheduler steps - if num_new_tokens > prompt_limit: - # If the seq_group is in prompt-stage, pass the - # num_new_tokens as-is so the caller can ignore - # the sequence. - return num_new_tokens - - return 0 if num_new_tokens > \ - remaining_token_budget else num_new_tokens - - # Get the number of tokens to allocate to this prefill slot - prefill_slot_budget = ( - remaining_token_budget if partial_prefill_metadata is None else - partial_prefill_budget_lookup_list[ - partial_prefill_metadata.schedulable_prefills]) - - if cache_config.enable_prefix_caching: - # When prefix caching is enabled and we're partially prefilling - # a sequence, we always allocate a number of new tokens that is - # divisible by the block size to avoid partial block matching. - block_size = cache_config.block_size - # Don't exceed either the total budget or slot budget. 
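Chunk sizing with prefix caching rounds the per-prefill cap down to a whole number of blocks, as the surrounding comments explain, so a partially filled block never has to be matched against the cache. A standalone sketch of the non-multi-step path of that computation; the helper name is illustrative.

def chunk_new_tokens(num_new_tokens: int,
                     remaining_token_budget: int,
                     prefill_slot_budget: int,
                     block_size: int,
                     enable_prefix_caching: bool) -> int:
    # Cap the chunk by both the global token budget and this prefill's slot
    # budget; with prefix caching, align that cap down to the block size.
    cap = min(remaining_token_budget, prefill_slot_budget)
    if enable_prefix_caching:
        cap = (cap // block_size) * block_size
    return min(num_new_tokens, cap)   # a short final chunk stays unaligned


# A 350-token slot budget is trimmed to 336 (21 blocks of 16) when prefix
# caching is enabled, and used as-is otherwise.
assert chunk_new_tokens(5000, 1000, 350, 16, enable_prefix_caching=True) == 336
assert chunk_new_tokens(5000, 1000, 350, 16, enable_prefix_caching=False) == 350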
- # Take min of those and get the next lowest multiple of the - # block size: - remaining_token_budget = ( - min(remaining_token_budget, prefill_slot_budget) // - block_size) * block_size - # NB: In the case where num_new_tokens < budget, we are - # finishing prefill for this sequence, so we do not need to - # allocate a full block. - - num_new_tokens = min(num_new_tokens, remaining_token_budget, - prefill_slot_budget) - - return num_new_tokens diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py deleted file mode 100644 index 39642d89167b..000000000000 --- a/vllm/engine/async_llm_engine.py +++ /dev/null @@ -1,1196 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import copy -import time -import weakref -from functools import partial -from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List, - Mapping, Optional, Set, Tuple, Type, Union) -from weakref import ReferenceType - -import vllm.envs as envs -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, VllmConfig) -from vllm.core.scheduler import SchedulerOutputs -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_timeout import asyncio_timeout -from vllm.engine.llm_engine import LLMEngine, SchedulerOutputState -from vllm.engine.metrics_types import StatLoggerBase -from vllm.engine.protocol import EngineClient -from vllm.executor.executor_base import ExecutorBase -from vllm.inputs import PromptType -from vllm.inputs.preprocess import InputPreprocessor -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor.guided_decoding import ( - get_guided_decoding_logits_processor) -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import PoolingRequestOutput, RequestOutput -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import SamplingParams -from vllm.sequence import ExecuteModelRequest -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.usage.usage_lib import UsageContext -from vllm.utils import Device, weak_bind - -logger = init_logger(__name__) -ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S - - -class AsyncEngineDeadError(RuntimeError): - pass - - -def _log_task_completion(task: asyncio.Task, - error_callback: Callable[[Exception], None]) -> None: - """This function is only intended for the `engine.run_engine_loop()` task. - - In particular, that task runs a `while True` loop that can only exit if - there is an exception. - """ - - exception = None - try: - return_value = task.result() - raise AssertionError( - f"The engine background task should never finish without an " - f"exception. {return_value}") - except asyncio.exceptions.CancelledError: - # We assume that if the task is cancelled, we are gracefully shutting - # down. This should only happen on program exit. - logger.info("Engine is gracefully shutting down.") - except Exception as e: - exception = e - logger.error("Engine background task failed", exc_info=e) - error_callback(exception) - raise AsyncEngineDeadError( - "Task finished unexpectedly. This should never happen! " - "Please open an issue on GitHub. 
See stack trace above for the " - "actual cause.") from e - - -STOP_ITERATION = Exception() # Sentinel - - -class AsyncStream: - """A stream of RequestOutputs or PoolingRequestOutputs for a request - that can be iterated over asynchronously via an async generator.""" - - def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: - self.request_id = request_id - self._cancel = cancel - self._queue: asyncio.Queue = asyncio.Queue() - self._finished = False - - def put(self, item: Union[RequestOutput, PoolingRequestOutput, - Exception]) -> None: - if not self._finished: - self._queue.put_nowait(item) - - def finish( - self, - exception: Optional[Union[BaseException, Type[BaseException]]] = None, - ) -> None: - if not self._finished: - self._finished = True - self._queue.put_nowait( - exception if self._is_raisable(exception) else STOP_ITERATION) - - @property - def finished(self) -> bool: - return self._finished - - async def generator( - self - ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: - try: - while True: - result = await self._queue.get() - if self._is_raisable(result): - if result == STOP_ITERATION: - return - raise result - yield result - except GeneratorExit: - self._cancel(self.request_id) - raise asyncio.CancelledError from None - - @staticmethod - def _is_raisable(value: Any): - return isinstance(value, BaseException) or \ - (isinstance(value, type) and \ - issubclass(value, BaseException)) - - -class RequestTracker: - """Synchronous abstraction for tracking requests.""" - - def __init__(self) -> None: - self._request_streams: Dict[str, AsyncStream] = {} - self._aborted_requests: asyncio.Queue[str] = asyncio.Queue() - self._new_requests: asyncio.Queue[Tuple[AsyncStream, - dict]] = asyncio.Queue() - self.new_requests_event = asyncio.Event() - - def __contains__(self, item): - return item in self._request_streams - - def __len__(self) -> int: - return len(self._request_streams) - - def propagate_exception(self, - exc: Exception, - request_id: Optional[str] = None) -> None: - """Propagate an exception to request streams - (all if request_id is None).""" - if request_id is not None: - self.abort_request(request_id, exception=exc) - else: - # NB: tuple() used here because self.abort_request pops the stream - # out of self._request_streams, so we can't iterate on it directly - for rid in tuple(self._request_streams.keys()): - self.abort_request(rid, exception=exc) - - def process_request_output(self, - request_output: Union[RequestOutput, - PoolingRequestOutput], - *, - verbose: bool = False) -> None: - """Process a request output from the engine.""" - request_id = request_output.request_id - finished = request_output.finished - - if finished: - stream = self._request_streams.pop(request_id, None) - else: - stream = self._request_streams.get(request_id) - # Guard against a KeyError which can occur if the request was aborted - # while the output was generated - if stream is not None: - stream.put(request_output) - if finished: - stream.finish() - - if verbose and finished: - logger.info("Finished request %s.", request_id) - - def process_exception(self, - request_id: str, - exception: BaseException, - *, - verbose: bool = False) -> None: - """Propagate an exception from the engine.""" - if verbose: - logger.info("Finished request %s.", request_id) - self.abort_request(request_id, exception=exception) - - def add_request(self, - request_id: str, - *, - verbose: bool = False, - **engine_add_request_kwargs) -> AsyncStream: - """Add a request to be 
sent to the engine on the next background - loop iteration.""" - if request_id in self._request_streams: - raise KeyError(f"Request {request_id} already exists.") - - abort_request = partial(self.abort_request, verbose=verbose) - stream = AsyncStream(request_id, abort_request) - self._new_requests.put_nowait((stream, { - "request_id": request_id, - **engine_add_request_kwargs - })) - - self.new_requests_event.set() - - if verbose: - logger.info("Added request %s.", request_id) - - return stream - - def abort_request(self, - request_id: str, - *, - exception: Optional[Union[BaseException, - Type[BaseException]]] = None, - verbose: bool = False) -> None: - """Abort a request during next background loop iteration.""" - if verbose: - logger.info("Aborted request %s.", request_id) - - self._aborted_requests.put_nowait(request_id) - - stream = self._request_streams.pop(request_id, None) - if stream is not None: - stream.finish(exception=exception) - - def get_new_and_aborted_requests(self) -> Tuple[List[Dict], Set[str]]: - """Get the new requests and finished requests to be - sent to the engine.""" - new_requests: List[Dict] = [] - finished_requests: Set[str] = set() - - while not self._aborted_requests.empty(): - request_id = self._aborted_requests.get_nowait() - finished_requests.add(request_id) - - while not self._new_requests.empty(): - stream, new_request = self._new_requests.get_nowait() - request_id = stream.request_id - if request_id in finished_requests: - # The request has already been aborted. - stream.finish(asyncio.CancelledError) - finished_requests.discard(request_id) - else: - self._request_streams[request_id] = stream - new_requests.append(new_request) - - return new_requests, finished_requests - - async def wait_for_new_requests(self): - if not self.has_new_requests(): - await self.new_requests_event.wait() - self.new_requests_event.clear() - - def has_new_requests(self): - return not self._new_requests.empty() - - -class _AsyncLLMEngine(LLMEngine): - """Extension of LLMEngine to add async methods.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - async def step_async( - self, virtual_engine: int - ) -> List[Union[RequestOutput, PoolingRequestOutput]]: - """Performs one decoding iteration and returns newly generated results. - The workers are ran asynchronously if possible. - - This function performs one decoding iteration of the engine. It first - schedules the sequences to be executed in the next iteration and the - token blocks to be swapped in/out/copy. Then, it executes the model - and updates the scheduler with the model outputs. Finally, it decodes - the sequences and returns the newly generated results. - """ - # these are cached outputs from previous iterations. None if on first - # iteration - cached_outputs = self.cached_scheduler_outputs[virtual_engine] - seq_group_metadata_list = cached_outputs.seq_group_metadata_list - scheduler_outputs = cached_outputs.scheduler_outputs - allow_async_output_proc = cached_outputs.allow_async_output_proc - - ctx = self.scheduler_contexts[virtual_engine] - - # Clear outputs for each new scheduler iteration - ctx.request_outputs.clear() - - # skip the scheduler if there are any remaining steps in the seq groups. - # This ensures that the scheduler is only called again when the current - # batch has completed. 
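The step_async docstring and the comment above describe one engine iteration: schedule a batch, execute the model, process the outputs, and skip re-scheduling while a previously scheduled multi-step batch still has steps left. A toy, self-contained illustration of that control flow (all names invented; this is not the engine API):

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class ToySchedule:
    request_ids: List[str]
    remaining_steps: int = 1  # > 1 stands in for a multi-step batch

@dataclass
class ToyEngine:
    waiting: List[str] = field(default_factory=list)
    cached_schedule: Optional[ToySchedule] = None

    def step(self) -> List[str]:
        # Only re-run the "scheduler" once the in-flight batch has no steps left.
        if (self.cached_schedule is None
                or self.cached_schedule.remaining_steps == 0):
            self.cached_schedule = ToySchedule(self.waiting, remaining_steps=2)
            self.waiting = []
        schedule = self.cached_schedule
        if not schedule.request_ids:
            return []
        schedule.remaining_steps -= 1
        # Stand-in for executing the model and post-processing its outputs.
        return [f"{rid}: +1 token" for rid in schedule.request_ids]

engine = ToyEngine(waiting=["req-0", "req-1"])
for _ in range(3):
    print(engine.step())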
- if not self._has_remaining_steps(seq_group_metadata_list): - - # Schedule iteration - (seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc - ) = self.scheduler[virtual_engine].schedule() - - ctx.seq_group_metadata_list = seq_group_metadata_list - ctx.scheduler_outputs = scheduler_outputs - - if not scheduler_outputs.is_empty(): - # this will cause mamba_cache/minimax_cache failed - # to release finished_requests_ids of the last steps - finished_requests_ids = self.scheduler[ - virtual_engine].get_and_reset_finished_requests_ids() - - # Maybe switch from async mode to sync mode - if not allow_async_output_proc and len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - - if (self.scheduler_config.is_multi_step - and scheduler_outputs.num_lookahead_slots > 0): - # cache the scheduler outputs for the next iteration if we have - # lookahead slots - self._cache_scheduler_outputs_for_multi_step( - virtual_engine, seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc) - else: - finished_requests_ids = list() - - assert seq_group_metadata_list is not None - assert scheduler_outputs is not None - - if not scheduler_outputs.is_empty(): - - # Check if we have a cached last_output from the previous iteration. - # For supporting PP this is probably the best way to pass the - # sampled_token_ids, as a separate broadcast over all the PP stages - # will cause one virtual engine's microbatch to block the pipeline. - last_sampled_token_ids = \ - self._get_last_sampled_token_ids(virtual_engine) - - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in, - blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out, - blocks_to_copy=scheduler_outputs.blocks_to_copy, - virtual_engine=virtual_engine, - num_lookahead_slots=scheduler_outputs.num_lookahead_slots, - running_queue_size=scheduler_outputs.running_queue_size, - finished_requests_ids=finished_requests_ids, - # We use ExecuteModelRequest to pass the last sampled_token_ids - # to each of the non-last PP stages for in-place prepare_input. - last_sampled_token_ids=last_sampled_token_ids) - - if allow_async_output_proc: - execute_model_req.async_callback = self.async_callbacks[ - virtual_engine] - - # Execute the model. - outputs = await self.model_executor.execute_model_async( - execute_model_req) - - # we need to do this here so that last step's sampled_token_ids can - # be passed to the next iteration for PP. - if self.scheduler_config.is_multi_step: - self._update_cached_scheduler_output(virtual_engine, outputs) - else: - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - outputs = [] - - # Finish the current step for all the sequence groups. - if self.scheduler_config.is_multi_step: - for seq_group in seq_group_metadata_list: - seq_group.finish_step() - - if not self._has_remaining_steps(seq_group_metadata_list): - # Clear the cache if we have finished all the steps - if self.scheduler_config.is_multi_step: - self.cached_scheduler_outputs[ - virtual_engine] = SchedulerOutputState() - - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. When the num_steps > 1, - # multi_step_model_runner does the first-step output append. 
- is_first_step_output: bool = False if not seq_group_metadata_list \ - else seq_group_metadata_list[0].state.num_steps == 1 - - ctx.append_output(outputs=outputs, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - is_async=allow_async_output_proc, - is_last_step=True, - is_first_step_output=is_first_step_output) - - if outputs and allow_async_output_proc: - assert len( - outputs - ) == 1, "Async postprocessor expects only a single output set" - self._advance_to_next_step( - outputs[0], seq_group_metadata_list, - scheduler_outputs.scheduled_seq_groups) - - if not allow_async_output_proc: - self._process_model_outputs(ctx=ctx) - - # Log stats. - self.do_log_stats(scheduler_outputs, outputs) - - # Tracing - self.do_tracing(scheduler_outputs) - - else: - # Multi-step case - return ctx.request_outputs - - if not self.has_unfinished_requests(): - # Drain async postprocessor (if exists) - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - assert len(ctx.output_queue) == 0 - - return ctx.request_outputs - - async def stop_remote_worker_execution_loop_async(self) -> None: - """Stop the remote worker execution loop.""" - await self.model_executor.stop_remote_worker_execution_loop_async() - - async def get_tokenizer_async(self, - lora_request: Optional[LoRARequest] = None - ) -> AnyTokenizer: - return await ( - self.get_tokenizer_group().get_lora_tokenizer_async(lora_request)) - - async def add_request_async( - self, - request_id: str, - prompt: PromptType, - params: Union[SamplingParams, PoolingParams], - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> None: - """ - Async version of - [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]. - """ - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") - if priority != 0 and not self.scheduler_config.policy == "priority": - raise ValueError(f"Got priority {priority} but " - "Priority scheduling is not enabled.") - if arrival_time is None: - arrival_time = time.time() - - if data_parallel_rank is not None: - raise ValueError("Targeting data_parallel_rank only supported " - "in v1 client.") - - if (isinstance(prompt, dict) - and prompt.get("prompt_embeds", None) is not None - and not prompt.get("prompt_token_ids", None)): - # We use the -2 dimension (instead of 0) in case a batched input - # of batch size 1 is passed in. - prompt["prompt_token_ids"] = [0 - ] * prompt["prompt_embeds"].shape[-2] - - processed_inputs = await self.input_preprocessor.preprocess_async( - prompt, - lora_request=lora_request, - tokenization_kwargs=tokenization_kwargs, - ) - - if isinstance(params, SamplingParams) and \ - params.guided_decoding is not None: - # Guided decoding has an async implementation for building logits - # processors in a separate threadpool. 
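The prompt_embeds handling above fills in one dummy token id per embedding position, reading the sequence length from the -2 dimension so a batch-of-one input works the same as an unbatched one. A short sketch of just that shape logic (helper name invented; dummy tensors only):

import torch

def placeholder_token_ids(prompt_embeds: torch.Tensor) -> list:
    # shape[-2] is the sequence length for both [seq_len, hidden] and
    # [1, seq_len, hidden] inputs.
    return [0] * prompt_embeds.shape[-2]

print(len(placeholder_token_ids(torch.zeros(7, 4096))))     # 7
print(len(placeholder_token_ids(torch.zeros(1, 7, 4096))))  # 7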
- # We want to invoke that here instead of using the blocking - # implementation in the LLMEngine - params = await build_guided_decoding_logits_processor_async( - sampling_params=params, - tokenizer=await self.get_tokenizer_async(lora_request), - default_guided_backend=self.decoding_config.backend, - reasoning_backend=self.decoding_config.reasoning_backend, - model_config=self.model_config) - - self._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=params, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - ) - - async def check_health_async(self) -> None: - self.model_executor.check_health() - - async def collective_rpc_async(self, - method: str, - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None): - raise NotImplementedError - - -async def build_guided_decoding_logits_processor_async( - sampling_params: SamplingParams, tokenizer: AnyTokenizer, - default_guided_backend: str, reasoning_backend: Optional[str], - model_config: ModelConfig) -> SamplingParams: - """Constructs logits processors based on the guided_decoding, - logits_bias, and allowed_token_ids fields in sampling_params. Deletes - those fields and adds the constructed logits processors to the - logits_processors field. Modifies sampling params in-place and returns - the modified sampling params.""" - if sampling_params.guided_decoding is None: - return sampling_params - - # Defensively copy sampling params since guided decoding logits - # processors can have different state for each request - sampling_params = copy.copy(sampling_params) - guided_decoding = sampling_params.guided_decoding - - logger.debug( - "Building guided decoding logits processor. " - "guided_decoding: %s%s", guided_decoding, - f", reasoning_backend: {reasoning_backend}" - if reasoning_backend is not None else "") - - guided_decoding.backend = guided_decoding.backend or default_guided_backend - - processor = await get_guided_decoding_logits_processor( - guided_params=guided_decoding, - tokenizer=tokenizer, - reasoning_backend=reasoning_backend, - model_config=model_config) - - if processor: - if sampling_params.logits_processors is None: - sampling_params.logits_processors = [] - sampling_params.logits_processors.append(processor) - - # Unset guided decoding params after constructing the lp from them - sampling_params.guided_decoding = None - - return sampling_params - - -class AsyncLLMEngine(EngineClient): - """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine]. - - This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to - make it asynchronous. It uses asyncio to create a background loop that keeps - processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked - by the generate method when there are requests in the waiting queue. The - generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine] - to the caller. - - Args: - log_requests: Whether to log the requests. - start_engine_loop: If True, the background task to run the engine - will be automatically started in the generate call. - *args: Arguments for [`LLMEngine`][vllm.LLMEngine]. - **kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine]. - """ - - _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine - - def __init__(self, - *args, - log_requests: bool = True, - start_engine_loop: bool = True, - **kwargs) -> None: - if envs.VLLM_USE_V1: - raise ValueError( - "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. 
" - "This should not happen. As a workaround, try using " - "AsyncLLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github.") - - self.log_requests = log_requests - self.engine = self._engine_class(*args, **kwargs) - - # This ensures quick processing of request outputs - # so the append to asyncio queues is not delayed, - # especially for multi-step. - self.use_process_request_outputs_callback = ( - self.engine.model_config.use_async_output_proc) - - if self.use_process_request_outputs_callback: - self.engine.process_request_outputs_callback = \ - weak_bind(self.process_request_outputs) - - self.background_loop: Optional[asyncio.Future] = None - # We need to keep a reference to unshielded - # task as well to prevent it from being garbage - # collected - self._background_loop_unshielded: Optional[asyncio.Task] = None - self.start_engine_loop = start_engine_loop - self._errored_with: Optional[BaseException] = None - - # Lazy initialized fields - self._request_tracker: RequestTracker - - def __del__(self): - if rt := getattr(self, "request_tracker", None): - # Wake up engine loop so that it will exit cleanly - rt.new_requests_event.set() - - @classmethod - def _get_executor_cls(cls, - engine_config: VllmConfig) -> Type[ExecutorBase]: - return LLMEngine._get_executor_cls(engine_config) - - @classmethod - def from_vllm_config( - cls, - vllm_config: VllmConfig, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, - disable_log_requests: bool = False, - disable_log_stats: bool = False, - ) -> "AsyncLLMEngine": - """Create an AsyncLLMEngine from the EngineArgs.""" - - return cls( - vllm_config=vllm_config, - executor_class=cls._get_executor_cls(vllm_config), - start_engine_loop=start_engine_loop, - log_requests=not disable_log_requests, - log_stats=not disable_log_stats, - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - - @classmethod - def from_engine_args( - cls, - engine_args: AsyncEngineArgs, - start_engine_loop: bool = True, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - ) -> "AsyncLLMEngine": - """Creates an async LLM engine from the engine arguments.""" - - vllm_config = engine_args.create_engine_config(usage_context) - - async_engine_cls = cls - if envs.VLLM_USE_V1: - from vllm.v1.engine.async_llm import AsyncLLM as V1AsyncLLMEngine - async_engine_cls = V1AsyncLLMEngine - - return async_engine_cls.from_vllm_config( - vllm_config=vllm_config, - start_engine_loop=start_engine_loop, - usage_context=usage_context, - stat_loggers=stat_loggers, - disable_log_stats=engine_args.disable_log_stats, - disable_log_requests=engine_args.disable_log_requests, - ) - - @property - def is_running(self) -> bool: - return (self.background_loop is not None - and self._background_loop_unshielded is not None - and not self._background_loop_unshielded.done()) - - @property - def is_stopped(self) -> bool: - return self.errored or (self.background_loop is not None and - self._background_loop_unshielded is not None - and self._background_loop_unshielded.done()) - - @property - def errored(self) -> bool: - return self._errored_with is not None - - @property - def dead_error(self) -> BaseException: - return AsyncEngineDeadError( - "Background loop is not running. 
If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - def set_errored(self, exc: Exception) -> None: - self._errored_with = exc - - def _error_callback(self, exc: Exception) -> None: - self.set_errored(exc) - self._request_tracker.propagate_exception(exc) - - async def get_input_preprocessor(self) -> InputPreprocessor: - return self.engine.input_preprocessor - - async def get_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - return await self.engine.get_tokenizer_async(lora_request) - - def start_background_loop(self) -> None: - """Start the background loop.""" - if self.errored: - raise AsyncEngineDeadError( - "Background loop has errored already.") from self._errored_with - if self.is_running: - raise RuntimeError("Background loop is already running.") - # Initialize the RequestTracker here so it uses the right event loop. - self._request_tracker = RequestTracker() - - self._background_loop_unshielded = asyncio.get_event_loop( - ).create_task(self.run_engine_loop(weakref.ref(self))) - self._background_loop_unshielded.add_done_callback( - partial(_log_task_completion, error_callback=self._error_callback)) - self.background_loop = asyncio.shield(self._background_loop_unshielded) - - def shutdown_background_loop(self) -> None: - """ - Shut down the background loop. - - This method needs to be called during cleanup to remove - references to `self` and properly GC the resources held - by the async LLM engine (e.g., the executors as well as - their resources). - """ - if self._background_loop_unshielded is not None: - self._background_loop_unshielded.cancel() - self._background_loop_unshielded = None - self.background_loop = None - - async def engine_step(self, virtual_engine: int) -> bool: - """Kick the engine to process the waiting requests. - - Returns True if there are in-progress requests.""" - - new_requests, aborted_requests = ( - self._request_tracker.get_new_and_aborted_requests()) - - for new_request in new_requests: - # Add the request into the vLLM engine's waiting queue. - try: - await self.engine.add_request_async(**new_request) - except ValueError as e: - # TODO: use a vLLM specific error for failed validation - self._request_tracker.process_exception( - new_request["request_id"], - e, - verbose=self.log_requests, - ) - - if aborted_requests: - await self._engine_abort(aborted_requests) - - request_outputs = await self.engine.step_async(virtual_engine) - - # Put the outputs into the corresponding streams. - # If used as a callback, then already invoked inside - # LLMEngine's _process_model_outputs - if not self.use_process_request_outputs_callback: - all_finished = self.process_request_outputs(request_outputs) - else: - # For callback case, we only need to detect when all - # requests are finished - all_finished = all(request_output.finished - for request_output in request_outputs) - - return not all_finished - - def process_request_outputs(self, request_outputs) -> bool: - # Put the outputs into the corresponding streams. 
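The engine_step / process_request_outputs path above fans results out to one queue-backed stream per request: the background loop puts outputs into an asyncio.Queue, a sentinel marks completion, and the caller consumes the stream as an async generator. A minimal, self-contained sketch of the same pattern (class and method names invented):

import asyncio

_DONE = object()  # sentinel marking the end of a stream

class ToyStream:
    """Per-request output stream backed by an asyncio.Queue."""

    def __init__(self) -> None:
        self._queue: asyncio.Queue = asyncio.Queue()

    def put(self, item) -> None:
        self._queue.put_nowait(item)

    def finish(self) -> None:
        self._queue.put_nowait(_DONE)

    async def outputs(self):
        # Consumed by the caller as an async generator.
        while True:
            item = await self._queue.get()
            if item is _DONE:
                return
            yield item

async def main() -> None:
    stream = ToyStream()
    # Producer side (the background loop): push outputs, then finish.
    for token in ("Hello", ",", " world"):
        stream.put(token)
    stream.finish()
    # Consumer side (the caller): iterate until the stream is finished.
    async for out in stream.outputs():
        print(out)

asyncio.run(main())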
- all_finished = True - for request_output in request_outputs: - self._request_tracker.process_request_output( - request_output, verbose=self.log_requests) - all_finished = all_finished and request_output.finished - - return all_finished - - async def _engine_abort(self, request_ids: Iterable[str]): - self.engine.abort_request(request_ids) - - @staticmethod - async def run_engine_loop(engine_ref: ReferenceType): - """We use a weakref to the engine so that the running loop - doesn't prevent the engine being garbage collected.""" - engine: Optional[AsyncLLMEngine] = engine_ref() - if not engine: - return - - pipeline_parallel_size = \ - engine.engine.parallel_config.pipeline_parallel_size - has_requests_in_progress = [False] * pipeline_parallel_size - while True: - if not any(has_requests_in_progress): - logger.debug("Waiting for new requests...") - # Stop the execute model loop in parallel workers until there - # are more requests to process. This avoids waiting - # indefinitely in torch.distributed ops which may otherwise - # timeout, and unblocks the RPC thread in the workers so that - # they can process any other queued control plane messages, - # such as add/remove lora adapters. - await engine.engine.stop_remote_worker_execution_loop_async() - request_tracker = engine._request_tracker - # Allow engine to be garbage collected while - # waiting for new requests - del engine - await asyncio.sleep(0) - if engine_ref() is None: - return - await request_tracker.wait_for_new_requests() - engine = engine_ref() - if not engine: - return - logger.debug("Got new requests!") - requests_in_progress = [ - asyncio.create_task(engine.engine_step(ve)) - for ve in range(pipeline_parallel_size) - ] - has_requests_in_progress = [True] * pipeline_parallel_size - - # Abort if iteration takes too long due to unrecoverable errors - # (eg. NCCL timeouts). - try: - async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S): - done, _ = await asyncio.wait( - requests_in_progress, - return_when=asyncio.FIRST_COMPLETED) - for _ in range(pipeline_parallel_size): - await asyncio.sleep(0) - for task in done: - result = task.result() - virtual_engine = requests_in_progress.index(task) - has_unfinished_requests = ( - engine.engine. - has_unfinished_requests_for_virtual_engine( - virtual_engine)) - if result or has_unfinished_requests: - requests_in_progress[virtual_engine] = ( - asyncio.create_task( - engine.engine_step(virtual_engine))) - has_requests_in_progress[virtual_engine] = True - else: - has_requests_in_progress[virtual_engine] = False - except asyncio.TimeoutError as exc: - logger.error( - "Engine iteration timed out. This should never happen!") - engine.set_errored(exc) - raise - await asyncio.sleep(0) - - async def add_request( - self, - request_id: str, - prompt: PromptType, - params: Union[SamplingParams, PoolingParams], - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: - if not self.is_running: - if self.start_engine_loop: - self.start_background_loop() - else: - raise AsyncEngineDeadError( - "Background loop is not running. 
If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - if (priority != 0 - and not self.engine.scheduler_config.policy == "priority"): - raise ValueError(f"Got priority {priority} but " - "Priority scheduling is not enabled.") - - stream = self._request_tracker.add_request( - request_id, - verbose=self.log_requests, - prompt=prompt, - params=params, - arrival_time=arrival_time or time.time(), - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - data_parallel_rank=data_parallel_rank, - tokenization_kwargs=tokenization_kwargs, - ) - - return stream.generator() - - async def generate( - self, - prompt: PromptType, - sampling_params: SamplingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, - ) -> AsyncGenerator[RequestOutput, None]: - """Generate outputs for a request. - - Generate outputs for a request. This method is a coroutine. It adds the - request into the waiting queue of the LLMEngine and streams the outputs - from the LLMEngine to the caller. - - Args: - prompt: The prompt to the LLM. See - [`PromptType`][vllm.inputs.PromptType] for more details about - the format of each input. - sampling_params: The sampling parameters of the request. - request_id: The unique id of the request. - lora_request: LoRA request to use for generation, if any. - trace_headers: OpenTelemetry trace headers. - priority: The priority of the request. - Only applicable with priority scheduling. - data_parallel_rank: The (global) data parallel rank that must - handle this request. Only applicable if DP is enabled. - Yields: - The output `RequestOutput` objects from the LLMEngine - for the request. - - Details: - - If the engine is not running, start the background loop, - which iteratively invokes - [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step] - to process the waiting requests. - - Add the request to the engine's `RequestTracker`. - On the next background loop, this request will be sent to - the underlying engine. - Also, a corresponding `AsyncStream` will be created. - - Wait for the request outputs from `AsyncStream` and yield them. - - Example: - >>> # Please refer to entrypoints/api_server.py for - >>> # the complete example. - >>> - >>> # initialize the engine and the example input - >>> # note that engine_args here is AsyncEngineArgs instance - >>> engine = AsyncLLMEngine.from_engine_args(engine_args) - >>> example_input = { - >>> "prompt": "What is LLM?", - >>> "stream": False, # assume the non-streaming case - >>> "temperature": 0.0, - >>> "request_id": 0, - >>> } - >>> - >>> # start the generation - >>> results_generator = engine.generate( - >>> example_input["prompt"], - >>> SamplingParams(temperature=example_input["temperature"]), - >>> example_input["request_id"]) - >>> - >>> # get the results - >>> final_output = None - >>> async for request_output in results_generator: - >>> if await request.is_disconnected(): - >>> # Abort the request if the client disconnects. - >>> await engine.abort(request_id) - >>> # Return or raise an error - >>> ... - >>> final_output = request_output - >>> - >>> # Process and return the final output - >>> ... 
- """ - try: - async for output in await self.add_request( - request_id, - prompt, - sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - data_parallel_rank=data_parallel_rank, - ): - yield LLMEngine.validate_output(output, RequestOutput) - except asyncio.CancelledError: - await self.abort(request_id) - raise - - async def encode( - self, - prompt: PromptType, - pooling_params: PoolingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - tokenization_kwargs: Optional[dict[str, Any]] = None, - ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from a pooling model. - - Generate outputs for a request. This method is a coroutine. It adds the - request into the waiting queue of the LLMEngine and streams the outputs - from the LLMEngine to the caller. - - Args: - prompt: The prompt to the LLM. See - [`PromptType`][vllm.inputs.PromptType] for more details about - the format of each input. - pooling_params: The pooling parameters of the request. - request_id: The unique id of the request. - lora_request: LoRA request to use for generation, if any. - trace_headers: OpenTelemetry trace headers. - priority: The priority of the request. - Only applicable with priority scheduling. - - Yields: - The output `PoolingRequestOutput` objects from the LLMEngine - for the request. - - Details: - - If the engine is not running, start the background loop, - which iteratively invokes - [`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][] - to process the waiting requests. - - Add the request to the engine's `RequestTracker`. - On the next background loop, this request will be sent to - the underlying engine. - Also, a corresponding `AsyncStream` will be created. - - Wait for the request outputs from `AsyncStream` and yield them. - - Example: - ``` - # Please refer to entrypoints/api_server.py for - # the complete example. - - # initialize the engine and the example input - # note that engine_args here is AsyncEngineArgs instance - engine = AsyncLLMEngine.from_engine_args(engine_args) - example_input = { - "input": "What is LLM?", - "request_id": 0, - } - - # start the generation - results_generator = engine.encode( - example_input["input"], - PoolingParams(), - example_input["request_id"]) - - # get the results - final_output = None - async for request_output in results_generator: - if await request.is_disconnected(): - # Abort the request if the client disconnects. - await engine.abort(request_id) - # Return or raise an error - ... - final_output = request_output - - # Process and return the final output - ... - ``` - """ - try: - async for output in await self.add_request( - request_id, - prompt, - pooling_params, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - tokenization_kwargs=tokenization_kwargs, - ): - yield LLMEngine.validate_output(output, PoolingRequestOutput) - except asyncio.CancelledError: - await self.abort(request_id) - raise - - async def abort(self, request_id: str) -> None: - """Abort a request. - - Abort a submitted request. If the request is finished or not found, - this method will be a no-op. - - Args: - request_id: The unique id of the request. - """ - if not self.is_running: - raise AsyncEngineDeadError( - "Background loop is not running. 
If it was running, " - "inspect the output to find the stacktrace of the " - "error that caused the background loop to stop " - "(AsyncEngineDeadError).") - - return self._abort(request_id) - - def _abort(self, request_id: str) -> None: - """Abort a request. - - Abort a submitted request. If the request is finished or not found, - this method will be a no-op. - - Args: - request_id: The unique id of the request. - """ - self._request_tracker.abort_request(request_id, - exception=asyncio.CancelledError, - verbose=self.log_requests) - - async def get_vllm_config(self) -> VllmConfig: - """Get the vllm configuration of the vLLM engine.""" - return self.engine.get_vllm_config() - - async def get_model_config(self) -> ModelConfig: - """Get the model configuration of the vLLM engine.""" - return self.engine.get_model_config() - - async def get_parallel_config(self) -> ParallelConfig: - """Get the parallel configuration of the vLLM engine.""" - return self.engine.get_parallel_config() - - async def get_decoding_config(self) -> DecodingConfig: - """Get the decoding configuration of the vLLM engine.""" - return self.engine.get_decoding_config() - - async def get_scheduler_config(self) -> SchedulerConfig: - """Get the scheduling configuration of the vLLM engine.""" - return self.engine.get_scheduler_config() - - async def get_lora_config(self) -> LoRAConfig: - """Get the lora configuration of the vLLM engine.""" - return self.engine.get_lora_config() - - async def do_log_stats( - self, - scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None) -> None: - self.engine.do_log_stats() - - async def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - t = time.perf_counter() - logger.debug("Starting health check...") - if self.is_stopped: - raise AsyncEngineDeadError("Background loop is stopped.") - - await self.engine.check_health_async() - logger.debug("Health check took %fs", time.perf_counter() - t) - - async def is_tracing_enabled(self) -> bool: - return self.engine.is_tracing_enabled() - - def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: - self.engine.add_logger(logger_name=logger_name, logger=logger) - - def remove_logger(self, logger_name: str) -> None: - self.engine.remove_logger(logger_name=logger_name) - - async def start_profile(self) -> None: - self.engine.start_profile() - - async def stop_profile(self) -> None: - self.engine.stop_profile() - - async def reset_mm_cache(self) -> None: - self.engine.reset_mm_cache() - - async def reset_prefix_cache(self, - device: Optional[Device] = None) -> None: - self.engine.reset_prefix_cache(device) - - async def sleep(self, level: int = 1) -> None: - self.engine.sleep(level) - - async def wake_up(self, tags: Optional[list[str]] = None) -> None: - self.engine.wake_up(tags) - - async def is_sleeping(self) -> bool: - return self.engine.is_sleeping() - - async def add_lora(self, lora_request: LoRARequest) -> None: - self.engine.add_lora(lora_request) - - async def collective_rpc(self, - method: str, - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict] = None): - """ - Perform a collective RPC call to the given path. - """ - return await self.engine.collective_rpc_async(method, timeout, args, - kwargs) - - -# TODO(v1): Remove this class proxy when V1 goes default. 
-if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: - from vllm.v1.engine.async_llm import AsyncLLM - - AsyncLLMEngine = AsyncLLM # type: ignore diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py deleted file mode 100644 index 28a023a71ef5..000000000000 --- a/vllm/engine/async_timeout.py +++ /dev/null @@ -1,173 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Workaround for https://github.com/python/cpython/issues/86296 -# -# From https://github.com/aio-libs/async-timeout/blob/master/async_timeout/__init__.py -# Licensed under the Apache License (Apache-2.0) - -import asyncio -import enum -import sys -from types import TracebackType -from typing import Any, Optional, Type - -if sys.version_info[:2] >= (3, 11): - from asyncio import timeout as asyncio_timeout -else: - - def asyncio_timeout(delay: Optional[float]) -> "Timeout": - """timeout context manager. - Useful in cases when you want to apply timeout logic around block - of code or in cases when asyncio.wait_for is not suitable. For example: - >>> async with timeout(0.001): - ... async with aiohttp.get('https://github.com') as r: - ... await r.text() - delay - value in seconds or None to disable timeout logic - """ - loop = asyncio.get_running_loop() - deadline = loop.time() + delay if delay is not None else None - return Timeout(deadline, loop) - - class _State(enum.Enum): - INIT = "INIT" - ENTER = "ENTER" - TIMEOUT = "TIMEOUT" - EXIT = "EXIT" - - class Timeout: - # Internal class, please don't instantiate it directly - # Use timeout() and timeout_at() public factories instead. - # - # Implementation note: `async with timeout()` is preferred - # over `with timeout()`. - # While technically the Timeout class implementation - # doesn't need to be async at all, - # the `async with` statement explicitly points that - # the context manager should be used from async function context. - # - # This design allows to avoid many silly misusages. - # - # TimeoutError is raised immediately when scheduled - # if the deadline is passed. - # The purpose is to time out as soon as possible - # without waiting for the next await expression. - - __slots__ = ("_deadline", "_loop", "_state", "_timeout_handler") - - def __init__(self, deadline: Optional[float], - loop: asyncio.AbstractEventLoop) -> None: - self._loop = loop - self._state = _State.INIT - - self._timeout_handler = None # type: Optional[asyncio.Handle] - if deadline is None: - self._deadline = None # type: Optional[float] - else: - self.update(deadline) - - async def __aenter__(self) -> "Timeout": - self._do_enter() - return self - - async def __aexit__( - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> Optional[bool]: - self._do_exit(exc_type) - return None - - @property - def expired(self) -> bool: - """Is timeout expired during execution?""" - return self._state == _State.TIMEOUT - - @property - def deadline(self) -> Optional[float]: - return self._deadline - - def reject(self) -> None: - """Reject scheduled timeout if any.""" - # cancel is maybe better name but - # task.cancel() raises CancelledError in asyncio world. 
- if self._state not in (_State.INIT, _State.ENTER): - raise RuntimeError(f"invalid state {self._state.value}") - self._reject() - - def _reject(self) -> None: - if self._timeout_handler is not None: - self._timeout_handler.cancel() - self._timeout_handler = None - - def shift(self, delay: float) -> None: - """Advance timeout on delay seconds. - The delay can be negative. - Raise RuntimeError if shift is called when deadline is not scheduled - """ - deadline = self._deadline - if deadline is None: - raise RuntimeError( - "cannot shift timeout if deadline is not scheduled") - self.update(deadline + delay) - - def update(self, deadline: float) -> None: - """Set deadline to absolute value. - deadline argument points on the time in the same clock system - as loop.time(). - If new deadline is in the past the timeout is raised immediately. - Please note: it is not POSIX time but a time with - undefined starting base, e.g. the time of the system power on. - """ - if self._state == _State.EXIT: - raise RuntimeError( - "cannot reschedule after exit from context manager") - if self._state == _State.TIMEOUT: - raise RuntimeError("cannot reschedule expired timeout") - if self._timeout_handler is not None: - self._timeout_handler.cancel() - self._deadline = deadline - if self._state != _State.INIT: - self._reschedule() - - def _reschedule(self) -> None: - assert self._state == _State.ENTER - deadline = self._deadline - if deadline is None: - return - - now = self._loop.time() - if self._timeout_handler is not None: - self._timeout_handler.cancel() - - task = asyncio.current_task() - if deadline <= now: - self._timeout_handler = self._loop.call_soon( - self._on_timeout, task) - else: - self._timeout_handler = self._loop.call_at( - deadline, self._on_timeout, task) - - def _do_enter(self) -> None: - if self._state != _State.INIT: - raise RuntimeError(f"invalid state {self._state.value}") - self._state = _State.ENTER - self._reschedule() - - def _do_exit(self, exc_type: Optional[Type[BaseException]]) -> None: - if exc_type is asyncio.CancelledError and \ - self._state == _State.TIMEOUT: - self._timeout_handler = None - raise asyncio.TimeoutError - # timeout has not expired - self._state = _State.EXIT - self._reject() - return None - - def _on_timeout(self, task: "Optional[asyncio.Task[Any]]") -> None: - if task: - task.cancel() - self._state = _State.TIMEOUT - # drop the reference early - self._timeout_handler = None diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py deleted file mode 100644 index e7919d90442f..000000000000 --- a/vllm/engine/llm_engine.py +++ /dev/null @@ -1,2061 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import copy -import time -from collections import Counter as collectionsCounter -from collections import deque -from contextlib import contextmanager -from dataclasses import dataclass -from functools import partial -from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict, - Iterable, List, Literal, Mapping, NamedTuple, Optional) -from typing import Sequence as GenericSequence -from typing import Set, Type, Union, cast - -import torch -from typing_extensions import TypeVar - -import vllm.envs as envs -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, SchedulerConfig, - VllmConfig) -from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs -from vllm.engine.arg_utils import EngineArgs -from 
vllm.engine.metrics_types import StatLoggerBase, Stats -from vllm.engine.output_processor.interfaces import ( - SequenceGroupOutputProcessor) -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.engine.output_processor.util import create_output_by_sequence_group -from vllm.entrypoints.openai.logits_processors import ( - get_logits_processors as get_openai_logits_processors) -from vllm.executor.executor_base import ExecutorBase -from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs -from vllm.inputs.parse import split_enc_dec_inputs -from vllm.inputs.preprocess import InputPreprocessor -from vllm.logger import init_logger -from vllm.logits_process import get_bad_words_logits_processors -from vllm.lora.request import LoRARequest -from vllm.model_executor.guided_decoding import ( - get_local_guided_decoding_logits_processor) -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.multimodal.processing import EncDecMultiModalProcessor -from vllm.outputs import (PoolingRequestOutput, RequestOutput, - RequestOutputFactory) -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup, - PoolingSequenceGroupOutput, Sequence, SequenceGroup, - SequenceGroupBase, SequenceGroupMetadata, - SequenceGroupOutput, SequenceStatus) -from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, - init_tracer) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.transformers_utils.tokenizer_group import ( - TokenizerGroup, init_tokenizer_from_configs) -from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, - usage_message) -from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind -from vllm.version import __version__ as VLLM_VERSION -from vllm.worker.model_runner_base import InputProcessingError - -logger = init_logger(__name__) -_LOCAL_LOGGING_INTERVAL_SEC = 5 - -_O = TypeVar("_O", RequestOutput, PoolingRequestOutput) -_R = TypeVar("_R", default=Any) - - -@dataclass -class SchedulerOutputState: - """Caches the scheduler outputs for a virtual engine. Used for Multi-Step""" - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None - scheduler_outputs: Optional[SchedulerOutputs] = None - allow_async_output_proc: bool = False - last_output: Optional[SamplerOutput] = None - - -class OutputData(NamedTuple): - outputs: List[SamplerOutput] - seq_group_metadata_list: List[SequenceGroupMetadata] - scheduler_outputs: SchedulerOutputs - is_async: bool - is_last_step: bool - # Indicates if this output is from the first step of the - # multi-step. When multi-step is disabled, this is always - # set to True. - # is_first_step_output is invalid when `outputs` has - # outputs from multiple steps. 
- is_first_step_output: Optional[bool] - skip: List[int] - - -class SchedulerContext: - - def __init__(self, multi_step_stream_outputs: bool = False): - self.output_queue: Deque[OutputData] = deque() - self.request_outputs: List[Union[RequestOutput, - PoolingRequestOutput]] = [] - self.seq_group_metadata_list: Optional[ - List[SequenceGroupMetadata]] = None - self.scheduler_outputs: Optional[SchedulerOutputs] = None - - self.multi_step_stream_outputs: bool = multi_step_stream_outputs - - def append_output(self, outputs: List[SamplerOutput], - seq_group_metadata_list: List[SequenceGroupMetadata], - scheduler_outputs: SchedulerOutputs, is_async: bool, - is_last_step: bool, - is_first_step_output: Optional[bool]): - self.output_queue.append( - OutputData(outputs=outputs, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - is_async=is_async, - is_last_step=is_last_step, - is_first_step_output=is_first_step_output, - skip=[])) - - -class LLMEngine: - """An LLM engine that receives requests and generates texts. - - This is the main class for the vLLM engine. It receives requests - from clients and generates texts from the LLM. It includes a tokenizer, a - language model (possibly distributed across multiple GPUs), and GPU memory - space allocated for intermediate states (aka KV cache). This class utilizes - iteration-level scheduling and efficient memory management to maximize the - serving throughput. - - The [`LLM`][vllm.LLM] class wraps this class for offline batched inference - and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine] - class wraps this class for online serving. - - The config arguments are derived from [`EngineArgs`][vllm.EngineArgs]. - - Args: - vllm_config: The configuration for initializing and running vLLM. - executor_class: The model executor class for managing distributed - execution. - log_stats: Whether to log statistics. - usage_context: Specified entry point, used for usage info collection. 
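As the docstring above notes, this engine is normally driven through the LLM wrapper for offline batched inference (or AsyncLLMEngine for serving). A typical offline usage sketch; the model name is only an example and weights are downloaded on first use:

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.8, max_tokens=32)
for output in llm.generate(["What is an LLM engine?"], params):
    print(output.outputs[0].text)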
- """ - - DO_VALIDATE_OUTPUT: ClassVar[bool] = False - """A flag to toggle whether to validate the type of request output.""" - - @classmethod - @contextmanager - def enable_output_validation(cls): - cls.DO_VALIDATE_OUTPUT = True - - yield - - cls.DO_VALIDATE_OUTPUT = False - - @classmethod - def validate_output( - cls, - output: object, - output_type: Type[_O], - ) -> _O: - do_validate = cls.DO_VALIDATE_OUTPUT - - if ((TYPE_CHECKING or do_validate) - and not isinstance(output, output_type)): - raise TypeError(f"Expected output of type {output_type}, " - f"but found type {type(output)}") - - return cast(_O, output) - - @classmethod - def validate_outputs( - cls, - outputs: GenericSequence[object], - output_type: Type[_O], - ) -> List[_O]: - do_validate = cls.DO_VALIDATE_OUTPUT - - outputs_: List[_O] - if TYPE_CHECKING or do_validate: - outputs_ = [] - for output in outputs: - if not isinstance(output, output_type): - raise TypeError(f"Expected output of type {output_type}, " - f"but found type {type(output)}") - - outputs_.append(output) - else: - outputs_ = outputs - - return outputs_ - - tokenizer: Optional[TokenizerGroup] - - def __init__( - self, - vllm_config: VllmConfig, - executor_class: Type[ExecutorBase], - log_stats: bool, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - use_cached_outputs: bool = False, - ) -> None: - if envs.VLLM_USE_V1: - raise ValueError( - "Using V0 LLMEngine, but envs.VLLM_USE_V1=True. " - "This should not happen. As a workaround, try using " - "LLMEngine.from_vllm_config(...) or explicitly set " - "VLLM_USE_V1=0 or 1 and report this issue on Github.") - - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - self.cache_config = vllm_config.cache_config - self.lora_config = vllm_config.lora_config - self.parallel_config = vllm_config.parallel_config - self.scheduler_config = vllm_config.scheduler_config - self.device_config = vllm_config.device_config - self.speculative_config = vllm_config.speculative_config # noqa - self.load_config = vllm_config.load_config - self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa - ) - self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa - ) - - logger.info( - "Initializing a V0 LLM engine (v%s) with config: %s, " - "use_cached_outputs=%s, ", - VLLM_VERSION, - vllm_config, - use_cached_outputs, - ) - - self.log_stats = log_stats - self.use_cached_outputs = use_cached_outputs - - if self.model_config.skip_tokenizer_init: - self.tokenizer = None - self.detokenizer = None - tokenizer_group = None - else: - self.tokenizer = self._init_tokenizer() - self.detokenizer = Detokenizer(self.tokenizer) - tokenizer_group = self.get_tokenizer_group() - - # Ensure that the function doesn't contain a reference to self, - # to avoid engine GC issues - def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: - assert tokenizer_group, ("tokenizer_group cannot be None, " - "make sure skip_tokenizer_init is False") - return tokenizer_group.get_lora_tokenizer(sequence.lora_request) - - self.seq_counter = Counter() - self.generation_config_fields = ( - self.model_config.try_get_generation_config()) - - self.input_preprocessor = InputPreprocessor(self.model_config, - self.tokenizer, - mm_registry) - - self.model_executor = executor_class(vllm_config=vllm_config) - - if self.model_config.runner_type != "pooling": - 
self._initialize_kv_caches() - - # If usage stat is enabled, collect relevant info. - if is_usage_stats_enabled(): - from vllm.model_executor.model_loader import ( - get_architecture_class_name) - usage_message.report_usage( - get_architecture_class_name(self.model_config), - usage_context, - extra_kvs={ - # Common configuration - "dtype": - str(self.model_config.dtype), - "tensor_parallel_size": - self.parallel_config.tensor_parallel_size, - "block_size": - self.cache_config.block_size, - "gpu_memory_utilization": - self.cache_config.gpu_memory_utilization, - - # Quantization - "quantization": - self.model_config.quantization, - "kv_cache_dtype": - str(self.cache_config.cache_dtype), - - # Feature flags - "enable_lora": - bool(self.lora_config), - "enable_prefix_caching": - self.cache_config.enable_prefix_caching, - "enforce_eager": - self.model_config.enforce_eager, - "disable_custom_all_reduce": - self.parallel_config.disable_custom_all_reduce, - }) - - self.cached_scheduler_outputs = [ - SchedulerOutputState() - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - - self.scheduler_contexts = [ - SchedulerContext(multi_step_stream_outputs=self.scheduler_config. - multi_step_stream_outputs) - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - - if self.model_config.use_async_output_proc: - process_model_outputs = weak_bind(self._process_model_outputs) - - self.async_callbacks = [ - partial(process_model_outputs, - ctx=self.scheduler_contexts[v_id]) - for v_id in range(self.parallel_config.pipeline_parallel_size) - ] - else: - self.async_callbacks = [] - - # Currently used by AsyncLLMEngine to ensure quick append - # of request outputs to asyncio queues - self.process_request_outputs_callback: Optional[Callable] = None - - # Create the scheduler. - # NOTE: the cache_config here have been updated with the numbers of - # GPU and CPU blocks, which are profiled in the distributed executor. - if isinstance(self.vllm_config.scheduler_config.scheduler_cls, str): - Scheduler = resolve_obj_by_qualname( - self.vllm_config.scheduler_config.scheduler_cls) - else: - Scheduler = self.vllm_config.scheduler_config.scheduler_cls - self.scheduler = [ - Scheduler( - self.scheduler_config, self.cache_config, self.lora_config, - self.parallel_config.pipeline_parallel_size, - self.async_callbacks[v_id] - if self.model_config.use_async_output_proc else None) - for v_id in range(self.parallel_config.pipeline_parallel_size) - ] - - # Metric Logging. - if self.log_stats: - if stat_loggers is not None: - self.stat_loggers = stat_loggers - else: - # Lazy import for prometheus multiprocessing. - # We need to set PROMETHEUS_MULTIPROC_DIR environment variable - # before prometheus_client is imported. - # See https://prometheus.github.io/client_python/multiprocess/ - from vllm.engine.metrics import (LoggingStatLogger, - PrometheusStatLogger) - - self.stat_loggers = { - "logging": - LoggingStatLogger( - local_interval=_LOCAL_LOGGING_INTERVAL_SEC, - vllm_config=vllm_config), - "prometheus": - PrometheusStatLogger( - local_interval=_LOCAL_LOGGING_INTERVAL_SEC, - labels=dict( - model_name=self.model_config.served_model_name), - vllm_config=vllm_config), - } - self.stat_loggers["prometheus"].info("cache_config", - self.cache_config) - - self.tracer = None - if self.observability_config.otlp_traces_endpoint: - self.tracer = init_tracer( - "vllm.llm_engine", - self.observability_config.otlp_traces_endpoint) - - # Create sequence output processor, e.g. for beam search or - # speculative decoding. 
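The scheduler wiring above accepts scheduler_cls either as a class or as a dotted string resolved at runtime via resolve_obj_by_qualname (imported from vllm.utils earlier in this file). A small sketch of that resolve-by-qualified-name pattern, using an invented helper name:

import importlib

def resolve_by_qualname(qualname: str):
    # "package.module.ClassName" -> import the module, then grab the attribute.
    module_name, _, attr = qualname.rpartition(".")
    return getattr(importlib.import_module(module_name), attr)

OrderedDict = resolve_by_qualname("collections.OrderedDict")
print(OrderedDict([("a", 1)]))  # OrderedDict([('a', 1)])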
- self.output_processor = ( - SequenceGroupOutputProcessor.create_output_processor( - self.scheduler_config, - self.detokenizer, - self.scheduler, - self.seq_counter, - get_tokenizer_for_seq, - stop_checker=StopChecker(self.scheduler_config.max_model_len, - get_tokenizer_for_seq), - )) - - self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {} - - # Flag to set when an input fails to process and the engine should run - # the next step without re-scheduling. - self._skip_scheduling_next_step = False - - # Don't keep the dummy data in memory - self.reset_mm_cache() - - def _initialize_kv_caches(self) -> None: - """Initialize the KV cache in the worker(s). - - The workers will determine the number of blocks in both the GPU cache - and the swap CPU cache. - """ - start = time.time() - num_gpu_blocks, num_cpu_blocks = ( - self.model_executor.determine_num_available_blocks()) - - if self.cache_config.num_gpu_blocks_override is not None: - num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override - logger.info( - "Overriding num_gpu_blocks=%d with " - "num_gpu_blocks_override=%d", num_gpu_blocks, - num_gpu_blocks_override) - num_gpu_blocks = num_gpu_blocks_override - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) - elapsed = time.time() - start - logger.info(("init engine (profile, create kv cache, " - "warmup model) took %.2f seconds"), elapsed) - - @classmethod - def _get_executor_cls(cls, - engine_config: VllmConfig) -> Type[ExecutorBase]: - # distributed_executor_backend must be set in VllmConfig.__post_init__ - distributed_executor_backend = ( - engine_config.parallel_config.distributed_executor_backend) - # Initialize the cluster and specify the executor class. - if isinstance(distributed_executor_backend, type): - if not issubclass(distributed_executor_backend, ExecutorBase): - raise TypeError( - "distributed_executor_backend must be a subclass of " - f"ExecutorBase. Got {distributed_executor_backend}.") - executor_class = distributed_executor_backend - elif distributed_executor_backend == "ray": - from vllm.executor.ray_distributed_executor import ( - RayDistributedExecutor) - executor_class = RayDistributedExecutor - elif distributed_executor_backend == "mp": - from vllm.executor.mp_distributed_executor import ( - MultiprocessingDistributedExecutor) - assert not envs.VLLM_USE_RAY_SPMD_WORKER, ( - "multiprocessing distributed executor backend does not " - "support VLLM_USE_RAY_SPMD_WORKER=1") - executor_class = MultiprocessingDistributedExecutor - elif distributed_executor_backend == "uni": - # JAX-style, single-process, multi-device executor. 
- from vllm.executor.uniproc_executor import UniProcExecutor - executor_class = UniProcExecutor - elif distributed_executor_backend == "external_launcher": - # executor with external launcher - from vllm.executor.uniproc_executor import ( # noqa - ExecutorWithExternalLauncher) - executor_class = ExecutorWithExternalLauncher - else: - raise ValueError("unrecognized distributed_executor_backend: " - f"{distributed_executor_backend}") - return executor_class - - @classmethod - def from_vllm_config( - cls, - vllm_config: VllmConfig, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - disable_log_stats: bool = False, - ) -> "LLMEngine": - return cls( - vllm_config=vllm_config, - executor_class=cls._get_executor_cls(vllm_config), - log_stats=(not disable_log_stats), - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - - @classmethod - def from_engine_args( - cls, - engine_args: EngineArgs, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - ) -> "LLMEngine": - """Creates an LLM engine from the engine arguments.""" - # Create the engine configs. - vllm_config = engine_args.create_engine_config(usage_context) - - engine_cls = cls - if envs.VLLM_USE_V1: - from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine - engine_cls = V1LLMEngine - - return engine_cls.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - stat_loggers=stat_loggers, - disable_log_stats=engine_args.disable_log_stats, - ) - - def __reduce__(self): - # This is to ensure that the LLMEngine is not referenced in - # the closure used to initialize Ray worker actors - raise RuntimeError("LLMEngine should not be pickled!") - - def __del__(self): - # Shutdown model executor when engine is garbage collected - # Use getattr since __init__ can fail before the field is set - if model_executor := getattr(self, "model_executor", None): - model_executor.shutdown() - - def get_tokenizer_group(self) -> TokenizerGroup: - if self.tokenizer is None: - raise ValueError("Unable to get tokenizer because " - "skip_tokenizer_init is True") - - return self.tokenizer - - def get_tokenizer( - self, - lora_request: Optional[LoRARequest] = None, - ) -> AnyTokenizer: - return self.get_tokenizer_group().get_lora_tokenizer(lora_request) - - def _init_tokenizer(self) -> TokenizerGroup: - return init_tokenizer_from_configs( - model_config=self.model_config, - scheduler_config=self.scheduler_config, - lora_config=self.lora_config) - - def _verify_args(self) -> None: - self.model_config.verify_with_parallel_config(self.parallel_config) - self.cache_config.verify_with_parallel_config(self.parallel_config) - if self.lora_config: - self.lora_config.verify_with_model_config(self.model_config) - self.lora_config.verify_with_scheduler_config( - self.scheduler_config) - - def _add_processed_request( - self, - request_id: str, - processed_inputs: ProcessorInputs, - params: Union[SamplingParams, PoolingParams], - arrival_time: float, - lora_request: Optional[LoRARequest], - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> Optional[SequenceGroup]: - """Add a processed request to the engine's request pool. - return the created sequence group. 
- """ - if isinstance(params, SamplingParams) and params.n > 1: - ParallelSampleSequenceGroup.add_request( - request_id, - self, - params, - processed_inputs=processed_inputs, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - ) - return None - - self._validate_model_inputs(processed_inputs, lora_request) - # Create the sequences. - block_size = self.cache_config.block_size - seq_id = next(self.seq_counter) - eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - - encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) - - seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id, - lora_request) - - encoder_seq = (None if encoder_inputs is None else Sequence( - seq_id, encoder_inputs, block_size, eos_token_id, lora_request)) - - # Create a SequenceGroup based on SamplingParams or PoolingParams - if isinstance(params, SamplingParams): - seq_group = self._create_sequence_group_with_sampling( - request_id, - seq, - params, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - encoder_seq=encoder_seq, - priority=priority) - elif isinstance(params, PoolingParams): - seq_group = self._create_sequence_group_with_pooling( - request_id, - seq, - params, - arrival_time=arrival_time, - lora_request=lora_request, - encoder_seq=encoder_seq, - priority=priority) - else: - raise ValueError( - "Either SamplingParams or PoolingParams must be provided.") - - # Add the sequence group to the scheduler with least unfinished seqs. - costs = [ - scheduler.get_num_unfinished_seq_groups() - for scheduler in self.scheduler - ] - min_cost_scheduler = self.scheduler[costs.index(min(costs))] - min_cost_scheduler.add_seq_group(seq_group) - - return seq_group - - def stop_remote_worker_execution_loop(self) -> None: - self.model_executor.stop_remote_worker_execution_loop() - - def add_request( - self, - request_id: str, - prompt: PromptType, - params: Union[SamplingParams, PoolingParams], - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - tokenization_kwargs: Optional[dict[str, Any]] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> None: - """Add a request to the engine's request pool. - - The request is added to the request pool and will be processed by the - scheduler as `engine.step()` is called. The exact scheduling policy is - determined by the scheduler. - - Args: - request_id: The unique ID of the request. - prompt: The prompt to the LLM. See - [PromptType][vllm.inputs.PromptType] - for more details about the format of each input. - params: Parameters for sampling or pooling. - [SamplingParams][vllm.SamplingParams] for text generation. - [PoolingParams][vllm.PoolingParams] for pooling. - arrival_time: The arrival time of the request. If None, we use - the current monotonic time. - lora_request: The LoRA request to add. - trace_headers: OpenTelemetry trace headers. - priority: The priority of the request. - Only applicable with priority scheduling. - - Details: - - Set arrival_time to the current time if it is None. - - Set prompt_token_ids to the encoded prompt if it is None. - - Create `n` number of [Sequence][vllm.Sequence] objects. - - Create a [SequenceGroup][vllm.SequenceGroup] object - from the list of [Sequence][vllm.Sequence]. - - Add the [SequenceGroup][vllm.SequenceGroup] object to the - scheduler. 
- - Example: - >>> # initialize engine - >>> engine = LLMEngine.from_engine_args(engine_args) - >>> # set request arguments - >>> example_prompt = "Who is the president of the United States?" - >>> sampling_params = SamplingParams(temperature=0.0) - >>> request_id = 0 - >>> - >>> # add the request to the engine - >>> engine.add_request( - >>> str(request_id), - >>> example_prompt, - >>> SamplingParams(temperature=0.0)) - >>> # continue the request processing - >>> ... - """ - if not isinstance(request_id, str): - raise TypeError( - f"request_id must be a string, got {type(request_id)}") - - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") - - if priority != 0 and not self.scheduler_config.policy == "priority": - raise ValueError(f"Got priority {priority} but " - "Priority scheduling is not enabled.") - - if isinstance(params, SamplingParams) \ - and (params.guided_decoding or params.logits_processors) \ - and self.scheduler_config.num_scheduler_steps > 1: - raise ValueError( - "Guided decoding and logits processors are not supported " - "in multi-step decoding") - - if arrival_time is None: - arrival_time = time.time() - - if (isinstance(prompt, dict) - and prompt.get("prompt_embeds", None) is not None - and not prompt.get("prompt_token_ids", None)): - seq_len = prompt["prompt_embeds"].shape[0] - prompt["prompt_token_ids"] = [0] * seq_len - - processed_inputs = self.input_preprocessor.preprocess( - prompt, - tokenization_kwargs=tokenization_kwargs, - lora_request=lora_request, - ) - - self._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=params, - arrival_time=arrival_time, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - ) - - def _create_sequence_group_with_sampling( - self, - request_id: str, - seq: Sequence, - sampling_params: SamplingParams, - arrival_time: float, - lora_request: Optional[LoRARequest], - trace_headers: Optional[Mapping[str, str]] = None, - encoder_seq: Optional[Sequence] = None, - priority: int = 0, - ) -> SequenceGroup: - """Creates a SequenceGroup with SamplingParams.""" - max_logprobs = self.get_model_config().max_logprobs - if (sampling_params.logprobs - and sampling_params.logprobs > max_logprobs) or ( - sampling_params.prompt_logprobs - and sampling_params.prompt_logprobs > max_logprobs): - raise ValueError(f"Cannot request more than " - f"{max_logprobs} logprobs.") - - sampling_params = self._build_logits_processors( - sampling_params, lora_request) - - # Defensive copy of SamplingParams, which are used by the sampler, - # this doesn't deep-copy LogitsProcessor objects - sampling_params = sampling_params.clone() - - sampling_params.update_from_generation_config( - self.generation_config_fields, seq.eos_token_id) - - # Create the sequence group. 
- draft_size = 1 - if self.vllm_config.speculative_config is not None: - draft_size = \ - self.vllm_config.speculative_config.num_speculative_tokens + 1 - seq_group = SequenceGroup(request_id=request_id, - seqs=[seq], - arrival_time=arrival_time, - sampling_params=sampling_params, - lora_request=lora_request, - trace_headers=trace_headers, - encoder_seq=encoder_seq, - priority=priority, - draft_size=draft_size) - - return seq_group - - def _create_sequence_group_with_pooling( - self, - request_id: str, - seq: Sequence, - pooling_params: PoolingParams, - arrival_time: float, - lora_request: Optional[LoRARequest], - encoder_seq: Optional[Sequence] = None, - priority: int = 0, - ) -> SequenceGroup: - """Creates a SequenceGroup with PoolingParams.""" - # Defensive copy of PoolingParams, which are used by the pooler - pooling_params = pooling_params.clone() - # Create the sequence group. - seq_group = SequenceGroup(request_id=request_id, - seqs=[seq], - arrival_time=arrival_time, - lora_request=lora_request, - pooling_params=pooling_params, - encoder_seq=encoder_seq, - priority=priority) - return seq_group - - def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: - """Aborts a request(s) with the given ID. - - Args: - request_id: The ID(s) of the request to abort. - - Details: - - Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][]. - - Example: - >>> # initialize engine and add a request with request_id - >>> request_id = str(0) - >>> # abort the request - >>> engine.abort_request(request_id) - """ - for scheduler in self.scheduler: - scheduler.abort_seq_group( - request_id, seq_id_to_seq_group=self.seq_id_to_seq_group) - - def get_vllm_config(self) -> VllmConfig: - """Gets the vllm configuration.""" - return self.vllm_config - - def get_model_config(self) -> ModelConfig: - """Gets the model configuration.""" - return self.model_config - - def get_parallel_config(self) -> ParallelConfig: - """Gets the parallel configuration.""" - return self.parallel_config - - def get_decoding_config(self) -> DecodingConfig: - """Gets the decoding configuration.""" - return self.decoding_config - - def get_scheduler_config(self) -> SchedulerConfig: - """Gets the scheduler configuration.""" - return self.scheduler_config - - def get_lora_config(self) -> LoRAConfig: - """Gets the LoRA configuration.""" - return self.lora_config - - def get_num_unfinished_requests(self) -> int: - """Gets the number of unfinished requests.""" - return sum(scheduler.get_num_unfinished_seq_groups() - for scheduler in self.scheduler) - - def has_unfinished_requests(self) -> bool: - """Returns True if there are unfinished requests.""" - return any(scheduler.has_unfinished_seqs() - for scheduler in self.scheduler) - - def has_unfinished_requests_for_virtual_engine( - self, virtual_engine: int) -> bool: - """ - Returns True if there are unfinished requests for the virtual engine. 
- """ - return self.scheduler[virtual_engine].has_unfinished_seqs() - - def reset_mm_cache(self) -> bool: - """Reset the multi-modal cache.""" - return self.input_preprocessor.mm_registry.reset_processor_cache() - - def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: - """Reset prefix cache for all devices.""" - - success = True - for scheduler in self.scheduler: - success = success and scheduler.reset_prefix_cache(device) - return success - - @staticmethod - def _process_sequence_group_outputs( - seq_group: SequenceGroup, - outputs: List[PoolingSequenceGroupOutput], - ) -> None: - seq_group.pooled_data = outputs[0].data - - for seq in seq_group.get_seqs(): - seq.status = SequenceStatus.FINISHED_STOPPED - - return - - def _update_num_computed_tokens_for_multi_step_prefill( - self, seq_group: SequenceGroup, - seq_group_meta: SequenceGroupMetadata, - is_first_step_output: Optional[bool]): - """ - This function updates num_computed_tokens for prompt sequences - when Multi-Step is enabled. - - seq_group: SequenceGroup to update the num_computed_tokens for. - seq_group_meta: Metadata of the given SequenceGroup. - is_first_step_output: Optional[bool] - - When available, is_first_step_output indicates if the appended - output token is the output of the first-step in multi-step. - A value of None indicates that outputs from all steps in - in multi-step are submitted in a single burst. - """ - - assert self.scheduler_config.is_multi_step - - if not seq_group_meta.is_prompt: - # num_computed_token updates for multi-step decodes happen after - # the tokens are appended to the sequence. - return - - do_update: bool = False - if self.scheduler_config.chunked_prefill_enabled: - # In multi-step + chunked-prefill case, the prompt sequences - # that are scheduled are fully processed in the first step. - do_update = is_first_step_output is None or is_first_step_output - else: - # Normal multi-step decoding case. In this case prompt-sequences - # are actually single-stepped. Always update in this case. - assert seq_group.state.num_steps == 1 - do_update = True - - if do_update: - seq_group.update_num_computed_tokens( - seq_group_meta.token_chunk_size) - - def _process_model_outputs(self, - ctx: SchedulerContext, - request_id: Optional[str] = None) -> None: - """Apply the model output to the sequences in the scheduled seq groups - and return responses. - - ctx: The virtual engine context to work on - request_id: If provided, then only this request is going to be processed - """ - - now = time.time() - - if len(ctx.output_queue) == 0: - return None - - # Get pending async postprocessor - if request_id: - # When we process only one request, no pop is required - # (since later we will process all of the rest) - (outputs, seq_group_metadata_list, scheduler_outputs, is_async, - is_last_step, is_first_step_output, skip) = ctx.output_queue[0] - else: - (outputs, seq_group_metadata_list, scheduler_outputs, is_async, - is_last_step, is_first_step_output, - skip) = ctx.output_queue.popleft() - - # Sanity check - assert len(seq_group_metadata_list) == len( - scheduler_outputs.scheduled_seq_groups) - - has_multiple_outputs: bool = len(outputs) > 1 - outputs_by_sequence_group: List[List[SequenceGroupOutput]] - if has_multiple_outputs: - assert self.scheduler_config.is_multi_step or \ - self.speculative_config - # Organize outputs by [step][sequence group] instead of - # [sequence group][step]. 
- if self.scheduler_config.is_multi_step: - outputs_by_sequence_group = create_output_by_sequence_group( - outputs, len(seq_group_metadata_list)) - elif self.speculative_config: - # Decodes are multi-steps while prefills are not, outputting at - # most 1 token. Separate them so that we can trigger chunk - # processing without having to pad or copy over prompts K times - # to match decodes structure (costly with prompt_logprobs). - num_prefills = sum(sg.is_prompt - for sg in seq_group_metadata_list) - prefills, decodes = outputs[:num_prefills], outputs[ - num_prefills:] - outputs_by_sequence_group = create_output_by_sequence_group( - decodes, - num_seq_groups=len(seq_group_metadata_list) - num_prefills) - outputs_by_sequence_group = [p.outputs for p in prefills - ] + outputs_by_sequence_group - # We have outputs for multiple steps submitted in a single burst, - # so invalidate is_first_step_output. - is_first_step_output = None - else: - outputs_by_sequence_group = outputs - - # Determine the requests we need to operate on - if request_id: - indices = [] - for i, seq_group_meta in enumerate(seq_group_metadata_list): - if seq_group_meta.request_id == request_id: - assert i not in skip # Cannot be called twice - indices.append(i) - break - - # If the request_id was not found, then it means that - # this is a new request that has no pending async - # postprocessor - if not indices: - return - else: - indices = range(len(seq_group_metadata_list)) # type: ignore - - finished_before: List[int] = [] - finished_now: List[int] = [] - for i in indices: - if i in skip: - continue - - seq_group_meta = seq_group_metadata_list[i] - scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] - - seq_group: SequenceGroup = scheduled_seq_group.seq_group - - if seq_group.is_finished(): - finished_before.append(i) - continue - - output: List[SequenceGroupOutput] - if has_multiple_outputs: - output = outputs_by_sequence_group[i] - else: - output = [outputs_by_sequence_group[0][i]] - - if not is_async: - if self.scheduler_config.is_multi_step: - # Updates happen only if the sequence is prefill - self._update_num_computed_tokens_for_multi_step_prefill( - seq_group, seq_group_meta, is_first_step_output) - else: - seq_group.update_num_computed_tokens( - seq_group_meta.token_chunk_size or 0) - - if outputs: - for o in outputs: - if (isinstance(o, SamplerOutput) - and seq_group.metrics is not None): - if seq_group.metrics.model_forward_time is not None: - seq_group.metrics.model_forward_time += ( - o.model_forward_time or 0) - else: - seq_group.metrics.model_forward_time = ( - o.model_forward_time) - if seq_group.metrics.model_execute_time is not None: - seq_group.metrics.model_execute_time += ( - o.model_execute_time or 0) - else: - seq_group.metrics.model_execute_time = ( - o.model_execute_time) - - if self.model_config.runner_type == "pooling": - self._process_sequence_group_outputs(seq_group, output) - else: - self.output_processor.process_prompt_logprob(seq_group, output) - if seq_group_meta.do_sample: - self.output_processor.process_outputs( - seq_group, output, is_async) - - if seq_group.is_finished(): - finished_now.append(i) - - # Generate outputs for the requests that finished this iteration - for i in finished_now: - scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] - - seq_group = scheduled_seq_group.seq_group - seq_group.maybe_set_first_token_time(now) - if not seq_group.is_prefill(): - seq_group.set_last_token_time(now) - request_output = RequestOutputFactory.create( - seq_group, - 
self.seq_id_to_seq_group, - use_cache=self.use_cached_outputs) - if request_output: - ctx.request_outputs.append(request_output) - - # When we process a single request, we skip it for the next time, - # and invoke the request output callback (if there was final output) - if request_id: - assert len(indices) == 1 - skip.append(indices[0]) - - if (finished_now - and self.process_request_outputs_callback is not None): - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - return - - # Free currently finished requests - if finished_now: - for scheduler in self.scheduler: - scheduler.free_finished_seq_groups() - - # For multi-step without streaming, don't create outputs each iteration - if not is_last_step and not ctx.multi_step_stream_outputs: - # Immediately process request outputs here (if callback is given) - if (finished_now - and self.process_request_outputs_callback is not None): - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - return - - # Create the outputs - for i in indices: - if i in skip or i in finished_before or i in finished_now: - continue # Avoids double processing - - scheduled_seq_group = scheduler_outputs.scheduled_seq_groups[i] - - seq_group = scheduled_seq_group.seq_group - seq_group.maybe_set_first_token_time(now) - if not seq_group.is_prefill(): - seq_group.set_last_token_time(now) - request_output = RequestOutputFactory.create( - seq_group, - self.seq_id_to_seq_group, - use_cache=self.use_cached_outputs) - if request_output: - ctx.request_outputs.append(request_output) - - # For multi-step with streaming, create outputs each iteration - if not is_last_step and ctx.multi_step_stream_outputs: - # Immediately process request outputs here (if callback is given) - if self.process_request_outputs_callback is not None: - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - return - - for seq_group in scheduler_outputs.ignored_seq_groups: - params = seq_group.sampling_params - if params is not None and params.output_kind == ( - RequestOutputKind.DELTA) and not seq_group.is_finished(): - continue - - request_output = RequestOutputFactory.create( - seq_group, - self.seq_id_to_seq_group, - use_cache=self.use_cached_outputs, - ) - if request_output: - ctx.request_outputs.append(request_output) - - # Immediately process request outputs here (if callback is given) - if (ctx.request_outputs - and self.process_request_outputs_callback is not None): - self.process_request_outputs_callback(ctx.request_outputs) - ctx.request_outputs.clear() - - # For async case, we need to record the stats here. - # For non-async case, the stats are done in the - # LLMEngine/AsyncLLMEngine directly - if is_async: - # Log stats. - self.do_log_stats(scheduler_outputs, outputs, finished_before, - skip) - - # Tracing - self.do_tracing(scheduler_outputs, finished_before) - - return None - - def _advance_to_next_step( - self, output: SamplerOutput, - seq_group_metadata_list: List[SequenceGroupMetadata], - scheduled_seq_groups: List[ScheduledSequenceGroup]) -> None: - """Given model output from a single run, append the tokens to the - sequences. This is normally done inside output processor, but it is - required if the worker is to perform async forward pass to next step. 
- """ - for seq_group_metadata, sequence_group_outputs, scheduled_seq_group in \ - zip(seq_group_metadata_list, output, scheduled_seq_groups): - seq_group = scheduled_seq_group.seq_group - - if seq_group.is_finished(): - continue - - if self.scheduler_config.is_multi_step: - # Updates happen only if the sequence is prefill - self._update_num_computed_tokens_for_multi_step_prefill( - seq_group, seq_group_metadata, - seq_group.state.num_steps == 1) - else: - token_chunk_size = (seq_group_metadata.token_chunk_size - if seq_group_metadata.token_chunk_size - is not None else 0) - seq_group.update_num_computed_tokens(token_chunk_size) - - if seq_group_metadata.do_sample: - assert len(sequence_group_outputs.samples) == 1, ( - "Async output processor expects a single sample" - " (i.e sampling_params.n == 1)") - sample = sequence_group_outputs.samples[0] - - assert len(seq_group.seqs) == 1 - seq = seq_group.seqs[0] - - if self.scheduler_config.is_multi_step: - is_prefill_append = seq.data.get_num_uncomputed_tokens( - ) == 0 - seq.append_token_id(sample.output_token, sample.logprobs, - sample.output_embed) - if not is_prefill_append: - seq_group.update_num_computed_tokens(1) - else: - seq.append_token_id(sample.output_token, sample.logprobs, - sample.output_embed) - - def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: - """Performs one decoding iteration and returns newly generated results. - -
- <figure markdown="span">
- ![Overview of the step function](https://i.imgur.com/sv2HssD.png)
- <figcaption>Overview of the step function</figcaption>
- </figure>
- - Details: - - Step 1: Schedules the sequences to be executed in the next - iteration and the token blocks to be swapped in/out/copy. - - - Depending on the scheduling policy, - sequences may be `preempted/reordered`. - - A Sequence Group (SG) refer to a group of sequences - that are generated from the same prompt. - - - Step 2: Calls the distributed executor to execute the model. - - Step 3: Processes the model output. This mainly includes: - - - Decodes the relevant outputs. - - Updates the scheduled sequence groups with model outputs - based on its `sampling parameters` (`use_beam_search` or not). - - Frees the finished sequence groups. - - - Finally, it creates and returns the newly generated results. - - Example: - ``` - # Please see the example/ folder for more detailed examples. - - # initialize engine and request arguments - engine = LLMEngine.from_engine_args(engine_args) - example_inputs = [(0, "What is LLM?", - SamplingParams(temperature=0.0))] - - # Start the engine with an event loop - while True: - if example_inputs: - req_id, prompt, sampling_params = example_inputs.pop(0) - engine.add_request(str(req_id),prompt,sampling_params) - - # continue the request processing - request_outputs = engine.step() - for request_output in request_outputs: - if request_output.finished: - # return or show the request output - - if not (engine.has_unfinished_requests() or example_inputs): - break - ``` - """ - if self.parallel_config.pipeline_parallel_size > 1: - raise NotImplementedError( - "Pipeline parallelism is only supported through AsyncLLMEngine " - "as performance will be severely degraded otherwise.") - - # For llm_engine, there is no pipeline parallel support, so the engine - # used is always 0. - virtual_engine = 0 - - # These are cached outputs from previous iterations. None if on first - # iteration - cached_outputs = self.cached_scheduler_outputs[virtual_engine] - seq_group_metadata_list = cached_outputs.seq_group_metadata_list - scheduler_outputs = cached_outputs.scheduler_outputs - allow_async_output_proc = cached_outputs.allow_async_output_proc - - ctx = self.scheduler_contexts[virtual_engine] - - # Clear outputs for each new scheduler iteration - ctx.request_outputs.clear() - - # Skip the scheduler if there are any remaining steps in the seq groups. - # This ensures that the scheduler is only called again when the current - # batch has completed. - # The scheduler is also skipped if a single request caused the last - # engine step to fail, and the previous schedule needs to be rerun. - if not self._has_remaining_steps( - seq_group_metadata_list - ) and not self._skip_scheduling_next_step: - # Schedule iteration - (seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc - ) = self.scheduler[virtual_engine].schedule() - - ctx.seq_group_metadata_list = seq_group_metadata_list - ctx.scheduler_outputs = scheduler_outputs - - finished_requests_ids = self.scheduler[ - virtual_engine].get_and_reset_finished_requests_ids() - # When n>1, elements in self.seq_id_to_seq_group should be deleted - # here, otherwise memory leaks. 
- for finished_request_id in finished_requests_ids: - if finished_request_id in self.seq_id_to_seq_group: - del self.seq_id_to_seq_group[finished_request_id] - - # Maybe switch from async mode to sync mode - if not allow_async_output_proc and len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - - if (self.scheduler_config.is_multi_step - and scheduler_outputs.num_lookahead_slots > 0): - # cache the scheduler outputs for the next iteration if we have - # lookahead slots - self._cache_scheduler_outputs_for_multi_step( - virtual_engine, seq_group_metadata_list, scheduler_outputs, - allow_async_output_proc) - else: - finished_requests_ids = list() - - assert seq_group_metadata_list is not None - assert scheduler_outputs is not None - - if not scheduler_outputs.is_empty(): - - # Check if we have a cached last_output from the previous iteration. - # For supporting PP this is probably the best way to pass the - # sampled_token_ids, as a separate broadcast over all the PP stages - # will cause one virtual engine's microbatch to block the pipeline. - last_sampled_token_ids = \ - self._get_last_sampled_token_ids(virtual_engine) - - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in, - blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out, - blocks_to_copy=scheduler_outputs.blocks_to_copy, - num_lookahead_slots=scheduler_outputs.num_lookahead_slots, - running_queue_size=scheduler_outputs.running_queue_size, - finished_requests_ids=finished_requests_ids, - # We use ExecuteModelRequest to pass the last sampled_token_ids - # to each of the non-last PP stages for in-place prepare_input. - last_sampled_token_ids=last_sampled_token_ids) - - if allow_async_output_proc: - execute_model_req.async_callback = self.async_callbacks[ - virtual_engine] - - try: - outputs = self.model_executor.execute_model( - execute_model_req=execute_model_req) - self._skip_scheduling_next_step = False - except InputProcessingError as e: - # The input for this request cannot be processed, so we must - # abort it. If there are remaining requests in the batch that - # have been scheduled, they will be retried on the next step. - invalid_request_id = e.request_id - self._abort_and_cache_schedule( - request_id=invalid_request_id, - virtual_engine=virtual_engine, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - allow_async_output_proc=allow_async_output_proc) - # Raise so the caller is notified that this request failed - raise - - # We need to do this here so that last step's sampled_token_ids can - # be passed to the next iteration for PP. - if self.scheduler_config.is_multi_step: - self._update_cached_scheduler_output(virtual_engine, outputs) - else: - # Nothing scheduled => If there is pending async postprocessor, - # then finish it here. - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - # No outputs in this case - outputs = [] - - # Finish the current step for all the sequence groups. - if self.scheduler_config.is_multi_step: - for seq_group in seq_group_metadata_list: - seq_group.finish_step() - - if not self._has_remaining_steps(seq_group_metadata_list): - # clear the cache if we have finished all the steps. - if self.scheduler_config.is_multi_step: - self.cached_scheduler_outputs[0] = SchedulerOutputState() - - # is_first_step_output is True only when the num_steps of all - # the sequences are 1. 
When the num_steps > 1, - # multi_step_model_runner does the first-step output append. - is_first_step_output: bool = False if not seq_group_metadata_list \ - else seq_group_metadata_list[0].state.num_steps == 1 - - # Add results to the output_queue - ctx.append_output(outputs=outputs, - seq_group_metadata_list=seq_group_metadata_list, - scheduler_outputs=scheduler_outputs, - is_async=allow_async_output_proc, - is_last_step=True, - is_first_step_output=is_first_step_output) - - if outputs and allow_async_output_proc: - assert len(outputs) == 1, ( - "Async postprocessor expects only a single output set") - - self._advance_to_next_step( - outputs[0], seq_group_metadata_list, - scheduler_outputs.scheduled_seq_groups) - - # Check if need to run the usual non-async path - if not allow_async_output_proc: - self._process_model_outputs(ctx=ctx) - - # Log stats. - self.do_log_stats(scheduler_outputs, outputs) - - # Tracing - self.do_tracing(scheduler_outputs) - else: - # Multi-step case - return ctx.request_outputs - - if not self.has_unfinished_requests(): - # Drain async postprocessor (if exists) - if len(ctx.output_queue) > 0: - self._process_model_outputs(ctx=ctx) - assert len(ctx.output_queue) == 0 - - # Stop the execute model loop in parallel workers until there are - # more requests to process. This avoids waiting indefinitely in - # torch.distributed ops which may otherwise timeout, and unblocks - # the RPC thread in the workers so that they can process any other - # queued control plane messages, such as add/remove lora adapters. - logger.debug("Stopping remote worker execution loop.") - self.model_executor.stop_remote_worker_execution_loop() - - return ctx.request_outputs - - def _abort_and_cache_schedule( - self, request_id: str, virtual_engine: int, - seq_group_metadata_list: List[SequenceGroupMetadata], - scheduler_outputs: SchedulerOutputs, - allow_async_output_proc: bool) -> None: - """Aborts a single request, and caches the scheduler outputs minus that - request. This allows the next step to continue processing the remaining - requests without having to re-run the scheduler.""" - - # Abort the request and remove its sequence group from the current - # schedule - self.abort_request(request_id) - for i, metadata in enumerate(seq_group_metadata_list): - if metadata.request_id == request_id: - del seq_group_metadata_list[i] - break - for i, group in enumerate(scheduler_outputs.scheduled_seq_groups): - if group.seq_group.request_id == request_id: - del scheduler_outputs.scheduled_seq_groups[i] - break - - # If there are still other sequence groups left in the schedule, cache - # them and flag the engine to reuse the schedule. - if len(seq_group_metadata_list) > 0: - self._skip_scheduling_next_step = True - # Reuse multi-step caching logic - self._cache_scheduler_outputs_for_multi_step( - virtual_engine=virtual_engine, - scheduler_outputs=scheduler_outputs, - seq_group_metadata_list=seq_group_metadata_list, - allow_async_output_proc=allow_async_output_proc) - - def _has_remaining_steps( - self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] - ) -> bool: - if (not self.scheduler_config.is_multi_step - or not seq_group_metadata_list): - return False - - # TODO(will) this is a sanity check for nowto make sure that all the - # seqs are on the same steps. Eventually we will want to do some sort of - # dynamic scheduling when doing multi-step decoding. 
- ref_remaining_steps = seq_group_metadata_list[0].state.remaining_steps - if any([ - seq_group.state.remaining_steps != ref_remaining_steps - for seq_group in seq_group_metadata_list[1:] - ]): - raise AssertionError("All running sequence groups should " - "have the same remaining steps.") - - return ref_remaining_steps > 0 - - def _cache_scheduler_outputs_for_multi_step( - self, virtual_engine: int, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - scheduler_outputs: SchedulerOutputs, - allow_async_output_proc: bool) -> None: - co = self.cached_scheduler_outputs[virtual_engine] - - co.seq_group_metadata_list = seq_group_metadata_list - co.scheduler_outputs = scheduler_outputs - co.allow_async_output_proc = allow_async_output_proc - co.last_output = None - - def _update_cached_scheduler_output( - self, virtual_engine: int, - output: List[Optional[SamplerOutput]]) -> None: - if (self.parallel_config.pipeline_parallel_size > 1 and len(output) > 0 - and output[0] is not None): - last_output = output[-1] - assert last_output is not None - assert last_output.sampled_token_ids_cpu is not None - assert last_output.sampled_token_ids is None - assert last_output.sampled_token_probs is None - self.cached_scheduler_outputs[ - virtual_engine].last_output = last_output - - def _get_last_sampled_token_ids( - self, virtual_engine: int) -> Optional[torch.Tensor]: - cached_last_output = self.cached_scheduler_outputs[ - virtual_engine].last_output - if (self.scheduler_config.is_multi_step - and self.parallel_config.pipeline_parallel_size > 1 - and cached_last_output is not None - and cached_last_output.sampled_token_ids_cpu is not None): - return cached_last_output.sampled_token_ids_cpu - return None - - def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: - if not self.log_stats: - raise RuntimeError( - "Stat logging is disabled. Set `disable_log_stats=False` " - "argument to enable.") - if logger_name in self.stat_loggers: - raise KeyError(f"Logger with name {logger_name} already exists.") - self.stat_loggers[logger_name] = logger - - def remove_logger(self, logger_name: str) -> None: - if not self.log_stats: - raise RuntimeError( - "Stat logging is disabled. Set `disable_log_stats=False` " - "argument to enable.") - if logger_name not in self.stat_loggers: - raise KeyError(f"Logger with name {logger_name} does not exist.") - del self.stat_loggers[logger_name] - - def do_log_stats(self, - scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None, - finished_before: Optional[List[int]] = None, - skip: Optional[List[int]] = None) -> None: - """Forced log when no requests active.""" - if self.log_stats: - stats = self._get_stats(scheduler_outputs, model_output, - finished_before, skip) - for logger in self.stat_loggers.values(): - logger.log(stats) - - def _get_stats(self, - scheduler_outputs: Optional[SchedulerOutputs], - model_output: Optional[List[SamplerOutput]] = None, - finished_before: Optional[List[int]] = None, - skip: Optional[List[int]] = None) -> Stats: - """Get Stats to be Logged to Prometheus. - - Args: - scheduler_outputs: Optional, used to populate metrics related to - the scheduled batch, - model_output: Optional, used to emit speculative decoding metrics - which are created by the workers. - finished_before: Optional, indices of sequences that were finished - before. These sequences will be ignored. - skip: Optional, indices of sequences that were preempted. These - sequences will be ignored. 
- """ - now = time.time() - - # System State - # Scheduler State - num_running_sys = sum( - len(scheduler.running) for scheduler in self.scheduler) - num_swapped_sys = sum( - len(scheduler.swapped) for scheduler in self.scheduler) - num_waiting_sys = sum( - len(scheduler.waiting) for scheduler in self.scheduler) - - # KV Cache Usage in % - num_total_gpu = self.cache_config.num_gpu_blocks - gpu_cache_usage_sys = 0. - if num_total_gpu: # Guard against both None and 0 - num_free_gpu = sum( - scheduler.block_manager.get_num_free_gpu_blocks() - for scheduler in self.scheduler) - gpu_cache_usage_sys = 1.0 - (num_free_gpu / num_total_gpu) - - num_total_cpu = self.cache_config.num_cpu_blocks - cpu_cache_usage_sys = 0. - if num_total_cpu: # Guard against both None and 0 - num_free_cpu = sum( - scheduler.block_manager.get_num_free_cpu_blocks() - for scheduler in self.scheduler) - cpu_cache_usage_sys = 1.0 - (num_free_cpu / num_total_cpu) - - # Prefix Cache Hit Rate. Note that we always use - # the cache hit rate of the first virtual engine. - cpu_prefix_cache_hit_rate = self.scheduler[ - 0].get_prefix_cache_hit_rate(Device.CPU) - gpu_prefix_cache_hit_rate = self.scheduler[ - 0].get_prefix_cache_hit_rate(Device.GPU) - - # Exchange the uasge and cache hit stats between gpu and cpu when - # running on cpu because the cpu_worker.py intentionally reports the - # number of cpu blocks as gpu blocks in favor of cache management. - if self.device_config.device_type == "cpu": - num_total_gpu, num_total_cpu = num_total_cpu, num_total_gpu - gpu_cache_usage_sys, cpu_cache_usage_sys = ( - cpu_cache_usage_sys, - gpu_cache_usage_sys, - ) - gpu_prefix_cache_hit_rate, cpu_prefix_cache_hit_rate = ( - cpu_prefix_cache_hit_rate, - gpu_prefix_cache_hit_rate, - ) - - # Iteration stats - num_prompt_tokens_iter = 0 - num_generation_tokens_iter = 0 - num_tokens_iter = 0 - time_to_first_tokens_iter: List[float] = [] - time_per_output_tokens_iter: List[float] = [] - num_preemption_iter = (0 if scheduler_outputs is None else - scheduler_outputs.preempted) - - # Request stats - # Latency - time_e2e_requests: List[float] = [] - time_queue_requests: List[float] = [] - time_inference_requests: List[float] = [] - time_prefill_requests: List[float] = [] - time_decode_requests: List[float] = [] - # Metadata - num_prompt_tokens_requests: List[int] = [] - num_generation_tokens_requests: List[int] = [] - n_requests: List[int] = [] - max_num_generation_tokens_requests: List[int] = [] - max_tokens_requests: List[int] = [] - finished_reason_requests: List[str] = [] - - # LoRA requests - running_lora_adapters = dict( - collectionsCounter([ - running_request.lora_request.lora_name - for scheduler in self.scheduler - for running_request in scheduler.running - if running_request.lora_request - ])) - waiting_lora_adapters = dict( - collectionsCounter([ - waiting_request.lora_request.lora_name - for scheduler in self.scheduler - for waiting_request in scheduler.waiting - if waiting_request.lora_request - ])) - max_lora_stat = "0" - if self.lora_config: - max_lora_stat = str(self.lora_config.max_loras) - - # NOTE: This loop assumes prefill seq_groups are before - # decode seq_groups in scheduled_seq_groups. 
- if scheduler_outputs is not None: - # For async postprocessor, already finished sequences need to be - # not counted (to avoid double counting) - actual_num_batched_tokens = scheduler_outputs.num_batched_tokens # type: ignore - - num_generation_tokens_from_prefill_groups = 0 - # NOTE: if scheduler_outputs.num_prefill_groups > 0 and - # the len of scheduler_outputs.scheduled_seq_groups is != - # scheduler_outputs.num_prefill_groups, this means that - # chunked prefills have been detected. - - for idx, scheduled_seq_group in enumerate( - scheduler_outputs.scheduled_seq_groups): - # Skip double logging when using async output proc - if finished_before and idx in finished_before: - actual_num_batched_tokens -= 1 - continue - - # Currently, skip == preempted sequences, so we need to skip - # their log stats - if skip and idx in skip: - continue - - group_was_prefill = idx < scheduler_outputs.num_prefill_groups - seq_group = scheduled_seq_group.seq_group - - # NOTE: a seq_group that completed all of its prefill tokens - # in the last iteration will have seq_group.is_prefill() = False - # with group_was_prefill = True - if group_was_prefill: - # Number of prompt tokens. - num_prompt_tokens_iter += ( - scheduled_seq_group.token_chunk_size) - - # If the seq_group just finished the prefill state - # get TTFT. - if not seq_group.is_prefill(): - latency = seq_group.get_last_token_latency() - time_to_first_tokens_iter.append(latency) - - # One generation token per finished prefill. - num_generation_tokens_from_prefill_groups += ( - seq_group.num_seqs()) - else: - # TPOTs. - latency = seq_group.get_last_token_latency() - time_per_output_tokens_iter.append(latency) - if seq_group.state.current_step == 0: - # For async_output_proc, the do_log_stats() - # is called following init_multi_step(), which - # sets the current_step to zero. - actual_num_batched_tokens +=\ - seq_group.state.num_steps - 1 - else: - actual_num_batched_tokens +=\ - seq_group.state.current_step - 1 - - # Because of chunked prefill, we can have a single sequence - # group that does multiple prompt_runs. To prevent logging - # the same metadata more than once per request, we standardize - # on logging request level information for finished requests, - # which can only happen once. - if seq_group.is_finished(): - # Latency timings - time_e2e_requests.append(now - - seq_group.metrics.arrival_time) - if (seq_group.metrics.first_scheduled_time is not None and - seq_group.metrics.first_token_time is not None): - time_queue_requests.append( - seq_group.metrics.first_scheduled_time - - seq_group.metrics.arrival_time) - time_prefill_requests.append( - seq_group.metrics.first_token_time - - seq_group.metrics.first_scheduled_time) - time_decode_requests.append( - now - seq_group.metrics.first_token_time) - time_inference_requests.append( - now - seq_group.metrics.first_scheduled_time) - # Metadata - num_prompt_tokens_requests.append( - len(seq_group.prompt_token_ids)) - num_generation_tokens_requests.extend([ - seq.get_output_len() - for seq in seq_group.get_finished_seqs() - ]) - max_num_generation_tokens_requests.append( - max(seq.get_output_len() - for seq in seq_group.get_seqs())) - if seq_group.sampling_params is not None: - n_requests.append(seq_group.sampling_params.n) - max_tokens_requests.append( - seq_group.sampling_params.max_tokens) - finished_reason_requests.extend([ - SequenceStatus.get_finished_reason(seq.status) - for seq in seq_group.get_finished_seqs() - ]) - - # Number of generation tokens. 
- # num_batched_tokens equals the number of prompt_tokens plus the - # number of decode_tokens in a single iteration. So, - # num_generation_tokens = num_batched_tokens - num_prompt_tokens - # + num_generation_tokens_from_prefill_groups (since we generate - # one token on prefills on iters where the prefill finishes). - num_generation_tokens_iter = ( - actual_num_batched_tokens - num_prompt_tokens_iter + - num_generation_tokens_from_prefill_groups) - num_tokens_iter = (num_generation_tokens_iter + - num_prompt_tokens_iter) - - return Stats( - now=now, - # System stats - # Scheduler State - num_running_sys=num_running_sys, - num_swapped_sys=num_swapped_sys, - num_waiting_sys=num_waiting_sys, - # KV Cache Usage in % - gpu_cache_usage_sys=gpu_cache_usage_sys, - cpu_cache_usage_sys=cpu_cache_usage_sys, - # Prefix Cache Hit Rate - cpu_prefix_cache_hit_rate=cpu_prefix_cache_hit_rate, - gpu_prefix_cache_hit_rate=gpu_prefix_cache_hit_rate, - - # Iteration stats - num_prompt_tokens_iter=num_prompt_tokens_iter, - num_generation_tokens_iter=num_generation_tokens_iter, - num_tokens_iter=num_tokens_iter, - time_to_first_tokens_iter=time_to_first_tokens_iter, - time_per_output_tokens_iter=time_per_output_tokens_iter, - num_preemption_iter=num_preemption_iter, - - # Request stats - # Latency - time_e2e_requests=time_e2e_requests, - time_queue_requests=time_queue_requests, - time_inference_requests=time_inference_requests, - time_prefill_requests=time_prefill_requests, - time_decode_requests=time_decode_requests, - # Metadata - num_prompt_tokens_requests=num_prompt_tokens_requests, - num_generation_tokens_requests=num_generation_tokens_requests, - max_num_generation_tokens_requests= - max_num_generation_tokens_requests, - n_requests=n_requests, - max_tokens_requests=max_tokens_requests, - finished_reason_requests=finished_reason_requests, - max_lora=str(max_lora_stat), - waiting_lora_adapters=list(waiting_lora_adapters.keys()), - running_lora_adapters=list(running_lora_adapters.keys())) - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.model_executor.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.model_executor.remove_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.model_executor.list_loras() - - def pin_lora(self, lora_id: int) -> bool: - return self.model_executor.pin_lora(lora_id) - - def start_profile(self) -> None: - self.model_executor.start_profile() - - def stop_profile(self) -> None: - self.model_executor.stop_profile() - - def sleep(self, level: int = 1) -> None: - assert self.vllm_config.model_config.enable_sleep_mode, ( - "Sleep mode is not enabled in the model config") - self.model_executor.sleep(level=level) - - def wake_up(self, tags: Optional[list[str]] = None) -> None: - assert self.vllm_config.model_config.enable_sleep_mode, ( - "Sleep mode is not enabled in the model config") - self.model_executor.wake_up(tags) - - def is_sleeping(self) -> bool: - return self.model_executor.is_sleeping - - def check_health(self) -> None: - self.model_executor.check_health() - - def is_tracing_enabled(self) -> bool: - return self.tracer is not None - - def do_tracing(self, - scheduler_outputs: SchedulerOutputs, - finished_before: Optional[List[int]] = None) -> None: - if self.tracer is None: - return - - for idx, scheduled_seq_group in enumerate( - scheduler_outputs.scheduled_seq_groups): - # Skip double tracing when using async output proc - if finished_before and idx in finished_before: - continue - - seq_group = 
scheduled_seq_group.seq_group - if seq_group.is_finished(): - self.create_trace_span(seq_group) - - def create_trace_span(self, seq_group: SequenceGroup) -> None: - if self.tracer is None or seq_group.sampling_params is None: - return - arrival_time_nano_seconds = int(seq_group.metrics.arrival_time * 1e9) - - trace_context = extract_trace_context(seq_group.trace_headers) - - with self.tracer.start_as_current_span( - "llm_request", - kind=SpanKind.SERVER, - context=trace_context, - start_time=arrival_time_nano_seconds) as seq_span: - metrics = seq_group.metrics - ttft = metrics.first_token_time - metrics.arrival_time - e2e_time = metrics.finished_time - metrics.arrival_time - seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL, - self.model_config.model) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, - seq_group.request_id) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE, - seq_group.sampling_params.temperature) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, - seq_group.sampling_params.top_p) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS, - seq_group.sampling_params.max_tokens) - seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, - seq_group.sampling_params.n) - seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES, - seq_group.num_seqs()) - seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, - len(seq_group.prompt_token_ids)) - seq_span.set_attribute( - SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS, - sum([ - seq.get_output_len() - for seq in seq_group.get_finished_seqs() - ])) - seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, - metrics.time_in_queue) - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft) - seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, e2e_time) - if metrics.scheduler_time is not None: - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER, - metrics.scheduler_time) - if metrics.model_forward_time is not None: - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD, - metrics.model_forward_time / 1000.0) - if metrics.model_execute_time is not None: - seq_span.set_attribute( - SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE, - metrics.model_execute_time) - - def _validate_model_inputs(self, inputs: ProcessorInputs, - lora_request: Optional[LoRARequest]): - encoder_inputs, decoder_inputs = split_enc_dec_inputs(inputs) - - if encoder_inputs is not None: - self._validate_model_input(encoder_inputs, - lora_request, - prompt_type="encoder") - - self._validate_model_input(decoder_inputs, - lora_request, - prompt_type="decoder") - - def _validate_model_input( - self, - prompt_inputs: SingletonInputs, - lora_request: Optional[LoRARequest], - *, - prompt_type: Literal["encoder", "decoder"], - ): - model_config = self.model_config - tokenizer = (None if self.tokenizer is None else - self.tokenizer.get_lora_tokenizer(lora_request)) - - prompt_ids = prompt_inputs.get("prompt_token_ids", []) - if not prompt_ids: - if prompt_type == "encoder" and model_config.is_multimodal_model: - pass # Mllama may have empty encoder inputs for text-only data - elif prompt_inputs["type"] == "embeds": - pass - else: - raise ValueError(f"The {prompt_type} prompt cannot be empty") - - if tokenizer is not None: - max_input_id = max(prompt_ids, default=0) - if max_input_id > tokenizer.max_token_id: - raise ValueError( - f"Token id {max_input_id} is out of vocabulary") - - max_prompt_len = 
self.model_config.max_model_len - if len(prompt_ids) > max_prompt_len: - if prompt_type == "encoder" and model_config.is_multimodal_model: - mm_registry = self.input_preprocessor.mm_registry - mm_processor = mm_registry.create_processor( - model_config, - tokenizer=tokenizer or object(), # Dummy if no tokenizer - ) - assert isinstance(mm_processor, EncDecMultiModalProcessor) - - if mm_processor.pad_dummy_encoder_prompt: - return # Skip encoder length check for Whisper - - if model_config.is_multimodal_model: - suggestion = ( - "Make sure that `max_model_len` is no smaller than the " - "number of text tokens plus multimodal tokens. For image " - "inputs, the number of image tokens depends on the number " - "of images, and possibly their aspect ratios as well.") - else: - suggestion = ( - "Make sure that `max_model_len` is no smaller than the " - "number of text tokens.") - - raise ValueError( - f"The {prompt_type} prompt (length {len(prompt_ids)}) is " - f"longer than the maximum model length of {max_prompt_len}. " - f"{suggestion}") - - # TODO: Find out how many placeholder tokens are there so we can - # check that chunked prefill does not truncate them - # max_batch_len = self.scheduler_config.max_num_batched_tokens - - def _build_logits_processors( - self, sampling_params: SamplingParams, - lora_request: Optional[LoRARequest]) -> SamplingParams: - """Constructs logits processors based on the guided_decoding, - logits_bias, and allowed_token_ids fields in sampling_params. Deletes - those fields and adds the constructed logits processors to the - logits_processors field. Returns the modified sampling params.""" - - logits_processors = [] - - if sampling_params.guided_decoding is not None: - # Defensively copy sampling params since guided decoding logits - # processors can have different state for each request - sampling_params = copy.copy(sampling_params) - guided_decoding = sampling_params.guided_decoding - - logger.debug( - "Building guided decoding logits processor in " - "LLMEngine. 
Params: %s", guided_decoding) - - tokenizer = self.get_tokenizer(lora_request=lora_request) - guided_decoding.backend = guided_decoding.backend or \ - self.decoding_config.backend - - if self.decoding_config.reasoning_backend: - logger.debug("Building with reasoning backend %s", - self.decoding_config.reasoning_backend) - - processor = get_local_guided_decoding_logits_processor( - guided_params=guided_decoding, - tokenizer=tokenizer, - model_config=self.model_config, - reasoning_backend=self.decoding_config.reasoning_backend, - ) - if processor: - logits_processors.append(processor) - - # Unset so this doesn't get passed down to the model - sampling_params.guided_decoding = None - - if (sampling_params.logit_bias or sampling_params.allowed_token_ids): - tokenizer = self.get_tokenizer(lora_request=lora_request) - - processors = get_openai_logits_processors( - logit_bias=sampling_params.logit_bias, - allowed_token_ids=sampling_params.allowed_token_ids, - tokenizer=tokenizer) - logits_processors.extend(processors) - - # Unset so these don't get passed down to the model - sampling_params.logit_bias = None - sampling_params.allowed_token_ids = None - - if len(sampling_params.bad_words) > 0: - tokenizer = self.get_tokenizer(lora_request) - processors = get_bad_words_logits_processors( - bad_words=sampling_params.bad_words, tokenizer=tokenizer) - logits_processors.extend(processors) - - if logits_processors: - if sampling_params.logits_processors is None: - sampling_params.logits_processors = logits_processors - else: - sampling_params.logits_processors.extend(logits_processors) - - return sampling_params - - def collective_rpc(self, - method: Union[str, Callable[..., _R]], - timeout: Optional[float] = None, - args: tuple = (), - kwargs: Optional[dict[str, Any]] = None) -> list[_R]: - return self.model_executor.collective_rpc(method, timeout, args, - kwargs) - - -if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: - from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine - LLMEngine = V1LLMEngine # type: ignore diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py deleted file mode 100644 index ba8dbd1fad79..000000000000 --- a/vllm/engine/metrics.py +++ /dev/null @@ -1,563 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import time -from typing import Counter as CollectionsCounter -from typing import Dict, List, Optional, Type, Union, cast - -import numpy as np -import prometheus_client - -from vllm.config import SupportsMetricsInfo, VllmConfig -from vllm.engine.metrics_types import StatLoggerBase, Stats -from vllm.executor.ray_utils import ray -from vllm.logger import init_logger - -if ray is not None: - from ray.util import metrics as ray_metrics -else: - ray_metrics = None - -logger = init_logger(__name__) - -prometheus_client.disable_created_metrics() - -# The begin-* and end* here are used by the documentation generator -# to extract the metrics definitions. - - -# --8<-- [start:metrics-definitions] -class Metrics: - """ - vLLM uses a multiprocessing-based frontend for the OpenAI server. - This means that we need to run prometheus_client in multiprocessing mode - See https://prometheus.github.io/client_python/multiprocess/ for more - details on limitations. 
- """ - - labelname_finish_reason = "finished_reason" - labelname_waiting_lora_adapters = "waiting_lora_adapters" - labelname_running_lora_adapters = "running_lora_adapters" - labelname_max_lora = "max_lora" - _gauge_cls = prometheus_client.Gauge - _counter_cls = prometheus_client.Counter - _histogram_cls = prometheus_client.Histogram - - def __init__(self, labelnames: List[str], vllm_config: VllmConfig): - # Unregister any existing vLLM collectors (for CI/CD) - self._unregister_vllm_metrics() - - max_model_len = vllm_config.model_config.max_model_len - - # Use this flag to hide metrics that were deprecated in - # a previous release and which will be removed future - self.show_hidden_metrics = \ - vllm_config.observability_config.show_hidden_metrics - - # System stats - # Scheduler State - self.gauge_scheduler_running = self._gauge_cls( - name="vllm:num_requests_running", - documentation="Number of requests currently running on GPU.", - labelnames=labelnames, - multiprocess_mode="sum") - self.gauge_scheduler_waiting = self._gauge_cls( - name="vllm:num_requests_waiting", - documentation="Number of requests waiting to be processed.", - labelnames=labelnames, - multiprocess_mode="sum") - self.gauge_lora_info = self._gauge_cls( - name="vllm:lora_requests_info", - documentation="Running stats on lora requests.", - labelnames=[ - self.labelname_running_lora_adapters, - self.labelname_max_lora, - self.labelname_waiting_lora_adapters, - ], - multiprocess_mode="livemostrecent", - ) - - # KV Cache Usage in % - self.gauge_gpu_cache_usage = self._gauge_cls( - name="vllm:gpu_cache_usage_perc", - documentation="GPU KV-cache usage. 1 means 100 percent usage.", - labelnames=labelnames, - multiprocess_mode="sum") - - # Iteration stats - self.counter_num_preemption = self._counter_cls( - name="vllm:num_preemptions_total", - documentation="Cumulative number of preemption from the engine.", - labelnames=labelnames) - self.counter_prompt_tokens = self._counter_cls( - name="vllm:prompt_tokens_total", - documentation="Number of prefill tokens processed.", - labelnames=labelnames) - self.counter_generation_tokens = self._counter_cls( - name="vllm:generation_tokens_total", - documentation="Number of generation tokens processed.", - labelnames=labelnames) - self.histogram_iteration_tokens = self._histogram_cls( - name="vllm:iteration_tokens_total", - documentation="Histogram of number of tokens per engine_step.", - labelnames=labelnames, - buckets=[ - 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384 - ]) - self.histogram_time_to_first_token = self._histogram_cls( - name="vllm:time_to_first_token_seconds", - documentation="Histogram of time to first token in seconds.", - labelnames=labelnames, - buckets=[ - 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, - 2560.0 - ]) - self.histogram_time_per_output_token = self._histogram_cls( - name="vllm:time_per_output_token_seconds", - documentation="Histogram of time per output token in seconds.", - labelnames=labelnames, - buckets=[ - 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, - 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 - ]) - - # Request stats - # Latency - request_latency_buckets = [ - 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, - 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 - ] - self.histogram_e2e_time_request = self._histogram_cls( - name="vllm:e2e_request_latency_seconds", - documentation="Histogram of end to end request 
latency in seconds.", - labelnames=labelnames, - buckets=request_latency_buckets) - self.histogram_queue_time_request = self._histogram_cls( - name="vllm:request_queue_time_seconds", - documentation= - "Histogram of time spent in WAITING phase for request.", - labelnames=labelnames, - buckets=request_latency_buckets) - self.histogram_inference_time_request = self._histogram_cls( - name="vllm:request_inference_time_seconds", - documentation= - "Histogram of time spent in RUNNING phase for request.", - labelnames=labelnames, - buckets=request_latency_buckets) - self.histogram_prefill_time_request = self._histogram_cls( - name="vllm:request_prefill_time_seconds", - documentation= - "Histogram of time spent in PREFILL phase for request.", - labelnames=labelnames, - buckets=request_latency_buckets) - self.histogram_decode_time_request = self._histogram_cls( - name="vllm:request_decode_time_seconds", - documentation= - "Histogram of time spent in DECODE phase for request.", - labelnames=labelnames, - buckets=request_latency_buckets) - - # Metadata - self.histogram_num_prompt_tokens_request = self._histogram_cls( - name="vllm:request_prompt_tokens", - documentation="Number of prefill tokens processed.", - labelnames=labelnames, - buckets=build_1_2_5_buckets(max_model_len), - ) - self.histogram_num_generation_tokens_request = \ - self._histogram_cls( - name="vllm:request_generation_tokens", - documentation="Number of generation tokens processed.", - labelnames=labelnames, - buckets=build_1_2_5_buckets(max_model_len), - ) - self.histogram_max_num_generation_tokens_request = self._histogram_cls( - name="vllm:request_max_num_generation_tokens", - documentation= - "Histogram of maximum number of requested generation tokens.", - labelnames=labelnames, - buckets=build_1_2_5_buckets(max_model_len)) - self.histogram_n_request = self._histogram_cls( - name="vllm:request_params_n", - documentation="Histogram of the n request parameter.", - labelnames=labelnames, - buckets=[1, 2, 5, 10, 20], - ) - self.histogram_max_tokens_request = self._histogram_cls( - name="vllm:request_params_max_tokens", - documentation="Histogram of the max_tokens request parameter.", - labelnames=labelnames, - buckets=build_1_2_5_buckets(max_model_len), - ) - self.counter_request_success = self._counter_cls( - name="vllm:request_success_total", - documentation="Count of successfully processed requests.", - labelnames=labelnames + [Metrics.labelname_finish_reason]) - - -# --8<-- [end:metrics-definitions] - - def _unregister_vllm_metrics(self) -> None: - for collector in list(prometheus_client.REGISTRY._collector_to_names): - if hasattr(collector, "_name") and "vllm" in collector._name: - prometheus_client.REGISTRY.unregister(collector) - - -class _RayGaugeWrapper: - """Wraps around ray.util.metrics.Gauge to provide same API as - prometheus_client.Gauge""" - - def __init__(self, - name: str, - documentation: str = "", - labelnames: Optional[List[str]] = None, - multiprocess_mode: str = ""): - del multiprocess_mode - labelnames_tuple = tuple(labelnames) if labelnames else None - self._gauge = ray_metrics.Gauge(name=name, - description=documentation, - tag_keys=labelnames_tuple) - - def labels(self, **labels): - self._gauge.set_default_tags(labels) - return self - - def set(self, value: Union[int, float]): - return self._gauge.set(value) - - def set_to_current_time(self): - # ray metrics doesn't have set_to_current time, https://docs.ray.io/en/latest/_modules/ray/util/metrics.html - return self._gauge.set(time.time()) - - -class 
_RayCounterWrapper: - """Wraps around ray.util.metrics.Counter to provide same API as - prometheus_client.Counter""" - - def __init__(self, - name: str, - documentation: str = "", - labelnames: Optional[List[str]] = None): - labelnames_tuple = tuple(labelnames) if labelnames else None - self._counter = ray_metrics.Counter(name=name, - description=documentation, - tag_keys=labelnames_tuple) - - def labels(self, **labels): - self._counter.set_default_tags(labels) - return self - - def inc(self, value: Union[int, float] = 1.0): - if value == 0: - return - return self._counter.inc(value) - - -class _RayHistogramWrapper: - """Wraps around ray.util.metrics.Histogram to provide same API as - prometheus_client.Histogram""" - - def __init__(self, - name: str, - documentation: str = "", - labelnames: Optional[List[str]] = None, - buckets: Optional[List[float]] = None): - labelnames_tuple = tuple(labelnames) if labelnames else None - boundaries = buckets if buckets else [] - self._histogram = ray_metrics.Histogram(name=name, - description=documentation, - tag_keys=labelnames_tuple, - boundaries=boundaries) - - def labels(self, **labels): - self._histogram.set_default_tags(labels) - return self - - def observe(self, value: Union[int, float]): - return self._histogram.observe(value) - - -class RayMetrics(Metrics): - """ - RayMetrics is used by RayPrometheusStatLogger to log to Ray metrics. - Provides the same metrics as Metrics but uses Ray's util.metrics library. - """ - _gauge_cls: Type[prometheus_client.Gauge] = cast( - Type[prometheus_client.Gauge], _RayGaugeWrapper) - _counter_cls: Type[prometheus_client.Counter] = cast( - Type[prometheus_client.Counter], _RayCounterWrapper) - _histogram_cls: Type[prometheus_client.Histogram] = cast( - Type[prometheus_client.Histogram], _RayHistogramWrapper) - - def __init__(self, labelnames: List[str], vllm_config: VllmConfig): - if ray_metrics is None: - raise ImportError("RayMetrics requires Ray to be installed.") - super().__init__(labelnames, vllm_config) - - def _unregister_vllm_metrics(self) -> None: - # No-op on purpose - pass - - -def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: - """ - Builds a list of buckets with increasing powers of 10 multiplied by - mantissa values until the value exceeds the specified maximum. 
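    Example (consistent with the 1-2-5 helper's doctest below):

    >>> build_buckets([1, 2, 5], 100)
    [1, 2, 5, 10, 20, 50, 100]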
- - """ - exponent = 0 - buckets: List[int] = [] - while True: - for m in mantissa_lst: - value = m * 10**exponent - if value <= max_value: - buckets.append(value) - else: - return buckets - exponent += 1 - - -def build_1_2_5_buckets(max_value: int) -> List[int]: - """ - Example: - >>> build_1_2_5_buckets(100) - [1, 2, 5, 10, 20, 50, 100] - """ - return build_buckets([1, 2, 5], max_value) - - -def build_1_2_3_5_8_buckets(max_value: int) -> List[int]: - """ - Example: - >>> build_1_2_3_5_8_buckets(100) - [1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100] - """ - return build_buckets([1, 2, 3, 5, 8], max_value) - - -def local_interval_elapsed(now: float, last_log: float, - local_interval: float) -> bool: - elapsed_time = now - last_log - return elapsed_time > local_interval - - -def get_throughput(tracked_stats: List[int], now: float, - last_log: float) -> float: - return float(np.sum(tracked_stats) / (now - last_log)) - - -class LoggingStatLogger(StatLoggerBase): - """LoggingStatLogger is used in LLMEngine to log to Stdout.""" - - def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None: - super().__init__(local_interval, vllm_config) - self.last_prompt_throughput: Optional[float] = None - self.last_generation_throughput: Optional[float] = None - - def log(self, stats: Stats) -> None: - """Called by LLMEngine. - Logs to Stdout every self.local_interval seconds.""" - - # Save tracked stats for token counters. - self.num_prompt_tokens.append(stats.num_prompt_tokens_iter) - self.num_generation_tokens.append(stats.num_generation_tokens_iter) - - # Log locally every local_interval seconds. - if local_interval_elapsed(stats.now, self.last_local_log, - self.local_interval): - # Compute summary metrics for tracked stats (and log them - # to promethus if applicable). - prompt_throughput = get_throughput(self.num_prompt_tokens, - now=stats.now, - last_log=self.last_local_log) - generation_throughput = get_throughput( - self.num_generation_tokens, - now=stats.now, - last_log=self.last_local_log) - - log_fn = logger.info - if not any((prompt_throughput, generation_throughput, - self.last_prompt_throughput, - self.last_generation_throughput)): - # Avoid log noise on an idle production system - log_fn = logger.debug - - log_fn( - "Avg prompt throughput: %.1f tokens/s, " - "Avg generation throughput: %.1f tokens/s, " - "Running: %d reqs, Swapped: %d reqs, " - "Pending: %d reqs, GPU KV cache usage: %.1f%%, " - "CPU KV cache usage: %.1f%%.", - prompt_throughput, - generation_throughput, - stats.num_running_sys, - stats.num_swapped_sys, - stats.num_waiting_sys, - stats.gpu_cache_usage_sys * 100, - stats.cpu_cache_usage_sys * 100, - ) - if (stats.cpu_prefix_cache_hit_rate >= 0 - or stats.gpu_prefix_cache_hit_rate >= 0): - log_fn( - "Prefix cache hit rate: GPU: %.2f%%, CPU: %.2f%%", - stats.gpu_prefix_cache_hit_rate * 100, - stats.cpu_prefix_cache_hit_rate * 100, - ) - - self._reset(stats, prompt_throughput, generation_throughput) - - def _reset(self, stats, prompt_throughput, generation_throughput) -> None: - # Reset tracked stats for next interval. 
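        # Worked example for the helpers above (numbers invented for
        # illustration): with num_prompt_tokens == [10, 20, 30] accumulated
        # since last_log == 10.0 and now == 12.0,
        # get_throughput([10, 20, 30], now=12.0, last_log=10.0) == 30.0
        # tokens/s, and local_interval_elapsed(12.0, 10.0, local_interval=1.0)
        # is True because the 2.0 s gap exceeds the 1.0 s interval.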
- self.num_prompt_tokens = [] - self.num_generation_tokens = [] - self.last_local_log = stats.now - self.last_prompt_throughput = prompt_throughput - self.last_generation_throughput = generation_throughput - - def info(self, type: str, obj: SupportsMetricsInfo) -> None: - raise NotImplementedError - - -class PrometheusStatLogger(StatLoggerBase): - """PrometheusStatLogger is used LLMEngine to log to Promethus.""" - _metrics_cls = Metrics - _gauge_cls = prometheus_client.Gauge - - def __init__(self, local_interval: float, labels: Dict[str, str], - vllm_config: VllmConfig) -> None: - super().__init__(local_interval, vllm_config) - # Prometheus metrics - self.labels = labels - self.metrics = self._metrics_cls(labelnames=list(labels.keys()), - vllm_config=vllm_config) - - def _log_gauge(self, gauge, data: Union[int, float]) -> None: - # Convenience function for logging to gauge. - gauge.labels(**self.labels).set(data) - - def _log_counter(self, counter, data: Union[int, float]) -> None: - # Convenience function for logging to counter. - # Prevent ValueError from negative increment - if data < 0: - logger.warning("Skipping negative increment of %g to %s", data, - counter) - return - counter.labels(**self.labels).inc(data) - - def _log_counter_labels(self, counter, data: CollectionsCounter, - label_key: str) -> None: - # Convenience function for collection counter of labels. - for label, count in data.items(): - counter.labels(**{**self.labels, label_key: label}).inc(count) - - def _log_histogram(self, histogram, data: Union[List[int], - List[float]]) -> None: - # Convenience function for logging list to histogram. - for datum in data: - histogram.labels(**self.labels).observe(datum) - - def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None: - gauge.labels(**data).set_to_current_time() - - def _log_prometheus(self, stats: Stats) -> None: - # System state data - self._log_gauge(self.metrics.gauge_scheduler_running, - stats.num_running_sys) - self._log_gauge(self.metrics.gauge_scheduler_waiting, - stats.num_waiting_sys) - self._log_gauge(self.metrics.gauge_gpu_cache_usage, - stats.gpu_cache_usage_sys) - # Including max-lora in metric, in future this property of lora - # config maybe extended to be dynamic. 
- lora_info = { - self.metrics.labelname_running_lora_adapters: - ",".join(stats.running_lora_adapters), - self.metrics.labelname_waiting_lora_adapters: - ",".join(stats.waiting_lora_adapters), - self.metrics.labelname_max_lora: - stats.max_lora, - } - self._log_gauge_string(self.metrics.gauge_lora_info, lora_info) - # Iteration level data - self._log_counter(self.metrics.counter_num_preemption, - stats.num_preemption_iter) - self._log_counter(self.metrics.counter_prompt_tokens, - stats.num_prompt_tokens_iter) - self._log_counter(self.metrics.counter_generation_tokens, - stats.num_generation_tokens_iter) - self._log_histogram(self.metrics.histogram_iteration_tokens, - [stats.num_tokens_iter]) - self._log_histogram(self.metrics.histogram_time_to_first_token, - stats.time_to_first_tokens_iter) - self._log_histogram(self.metrics.histogram_time_per_output_token, - stats.time_per_output_tokens_iter) - - # Request level data - # Latency - self._log_histogram(self.metrics.histogram_e2e_time_request, - stats.time_e2e_requests) - self._log_histogram(self.metrics.histogram_queue_time_request, - stats.time_queue_requests) - self._log_histogram(self.metrics.histogram_inference_time_request, - stats.time_inference_requests) - self._log_histogram(self.metrics.histogram_prefill_time_request, - stats.time_prefill_requests) - self._log_histogram(self.metrics.histogram_decode_time_request, - stats.time_decode_requests) - # Metadata - finished_reason_counter = CollectionsCounter( - stats.finished_reason_requests) - self._log_counter_labels(self.metrics.counter_request_success, - finished_reason_counter, - Metrics.labelname_finish_reason) - self._log_histogram(self.metrics.histogram_num_prompt_tokens_request, - stats.num_prompt_tokens_requests) - self._log_histogram( - self.metrics.histogram_num_generation_tokens_request, - stats.num_generation_tokens_requests) - self._log_histogram(self.metrics.histogram_n_request, stats.n_requests) - self._log_histogram( - self.metrics.histogram_max_num_generation_tokens_request, - stats.max_num_generation_tokens_requests) - self._log_histogram(self.metrics.histogram_max_tokens_request, - stats.max_tokens_requests) - - def log(self, stats: Stats): - """Logs to prometheus and tracked stats every iteration.""" - # Log to prometheus. - self._log_prometheus(stats) - - # Save tracked stats for token counters. - self.num_prompt_tokens.append(stats.num_prompt_tokens_iter) - self.num_generation_tokens.append(stats.num_generation_tokens_iter) - - # Log locally every local_interval seconds. - if local_interval_elapsed(stats.now, self.last_local_log, - self.local_interval): - - # Reset tracked stats for next interval. - self.num_prompt_tokens = [] - self.num_generation_tokens = [] - self.last_local_log = stats.now - - def info(self, type: str, obj: SupportsMetricsInfo) -> None: - # Info type metrics are syntactic sugar for a gauge permanently set to 1 - # Since prometheus multiprocessing mode does not support Info, emulate - # info here with a gauge. 
- if type == "cache_config": - metrics_info = obj.metrics_info() - info_gauge = self._gauge_cls( - name="vllm:cache_config_info", - documentation="Information of the LLMEngine CacheConfig", - labelnames=metrics_info.keys(), - multiprocess_mode="mostrecent") - info_gauge.labels(**metrics_info).set(1) - - -class RayPrometheusStatLogger(PrometheusStatLogger): - """RayPrometheusStatLogger uses Ray metrics instead.""" - _metrics_cls = RayMetrics - - def info(self, type: str, obj: SupportsMetricsInfo) -> None: - return None diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py deleted file mode 100644 index 3281a9121a9d..000000000000 --- a/vllm/engine/metrics_types.py +++ /dev/null @@ -1,84 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -These types are defined in this file to avoid importing vllm.engine.metrics -and therefore importing prometheus_client. - -This is required due to usage of Prometheus multiprocess mode to enable -metrics after splitting out the uvicorn process from the engine process. - -Prometheus multiprocess mode requires setting PROMETHEUS_MULTIPROC_DIR -before prometheus_client is imported. Typically, this is done by setting -the env variable before launch, but since we are a library, we need to -do this in Python code and lazily import prometheus_client. -""" - -import time -from abc import ABC, abstractmethod -from dataclasses import dataclass -from typing import List - -from vllm.config import SupportsMetricsInfo, VllmConfig - - -@dataclass -class Stats: - """Created by LLMEngine for use by StatLogger.""" - now: float - - # System stats (should have _sys suffix) - # Scheduler State - num_running_sys: int - num_waiting_sys: int - num_swapped_sys: int - # KV Cache Usage in % - gpu_cache_usage_sys: float - cpu_cache_usage_sys: float - # Prefix caching block hit rate - cpu_prefix_cache_hit_rate: float - gpu_prefix_cache_hit_rate: float - - # Iteration stats (should have _iter suffix) - num_prompt_tokens_iter: int - num_generation_tokens_iter: int - num_tokens_iter: int - time_to_first_tokens_iter: List[float] - time_per_output_tokens_iter: List[float] - num_preemption_iter: int - - # Request stats (should have _requests suffix) - # Latency - time_e2e_requests: List[float] - time_queue_requests: List[float] - time_inference_requests: List[float] - time_prefill_requests: List[float] - time_decode_requests: List[float] - # Metadata - num_prompt_tokens_requests: List[int] - num_generation_tokens_requests: List[int] - n_requests: List[int] - max_num_generation_tokens_requests: List[int] - max_tokens_requests: List[int] - finished_reason_requests: List[str] - waiting_lora_adapters: List[str] - running_lora_adapters: List[str] - max_lora: str - - -class StatLoggerBase(ABC): - """Base class for StatLogger.""" - - def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None: - # Tracked stats over current local logging interval. 
- self.num_prompt_tokens: List[int] = [] - self.num_generation_tokens: List[int] = [] - self.last_local_log = time.time() - self.local_interval = local_interval - - @abstractmethod - def log(self, stats: Stats) -> None: - raise NotImplementedError - - @abstractmethod - def info(self, type: str, obj: SupportsMetricsInfo) -> None: - raise NotImplementedError diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py deleted file mode 100644 index ff0405d2f843..000000000000 --- a/vllm/engine/multiprocessing/__init__.py +++ /dev/null @@ -1,144 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import uuid -from dataclasses import dataclass, field -from enum import Enum -from typing import List, Mapping, Optional, Union - -from vllm import PoolingParams -from vllm.inputs import PromptType -from vllm.lora.request import LoRARequest -from vllm.outputs import RequestOutput -from vllm.sampling_params import SamplingParams -from vllm.utils import Device - -VLLM_RPC_SUCCESS_STR = "SUCCESS" - -IPC_INPUT_EXT = "_input_socket" -IPC_OUTPUT_EXT = "_output_socket" -IPC_HEALTH_EXT = "_health_socket" -IPC_DATA_EXT = "_data_socket" - - -class MQEngineDeadError(RuntimeError): - pass - - -@dataclass -class RPCProcessRequest: - prompt: PromptType - params: Union[SamplingParams, PoolingParams] - request_id: str - lora_request: Optional[LoRARequest] = None - trace_headers: Optional[Mapping[str, str]] = None - priority: int = 0 - - def __init__( - self, - prompt: PromptType, - params: Union[SamplingParams, PoolingParams], - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> None: - super().__init__() - - self.prompt = prompt - self.params = params - self.request_id = request_id - self.lora_request = lora_request - self.trace_headers = trace_headers - self.priority = priority - - -@dataclass -class RPCError: - request_id: Optional[str] - is_engine_errored: bool - exception: BaseException - - -@dataclass -class RPCAbortRequest: - request_id: str - - -class RPCStartupRequest(Enum): - IS_SERVER_READY = 1 - - -@dataclass -class RPCStartupResponse: - tracing_enabled: bool - - -class RPCUProfileRequest(Enum): - START_PROFILE = 1 - STOP_PROFILE = 2 - - -class RPCResetMultiModalCacheRequest(Enum): - RESET = 1 - - -@dataclass -class RPCResetPrefixCacheRequest: - device: Device - - -class RPCSleepRequest(Enum): - SLEEP_LEVEL_1 = 1 - SLEEP_LEVEL_2 = 2 - - -@dataclass -class RPCWakeUpRequest: - tags: Optional[list[str]] = None - - -@dataclass -class RPCIsSleepingRequest: - # Set the default value of request_id to a new UUID - request_id: str = field(default_factory=lambda: str(uuid.uuid4())) - - -@dataclass -class RPCIsSleepingResponse: - request_id: str - is_sleeping: bool - - -@dataclass -class RPCLoadAdapterRequest: - lora_request: LoRARequest - # Set the default value of request_id to a new UUID - request_id: str = field(default_factory=lambda: str(uuid.uuid4())) - - -@dataclass -class RPCAdapterLoadedResponse: - request_id: str - - -RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest, - RPCUProfileRequest, RPCLoadAdapterRequest, - RPCResetMultiModalCacheRequest, - RPCResetPrefixCacheRequest, RPCSleepRequest, - RPCWakeUpRequest, RPCIsSleepingRequest] - -REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse, - RPCIsSleepingResponse, RPCError] - - -def ENGINE_DEAD_ERROR( - error: 
Optional[BaseException] = None) -> MQEngineDeadError: - if error is None: - return MQEngineDeadError( - "Engine loop is not running. Inspect the stacktrace to " - "find the original error") - - return MQEngineDeadError( - "Engine loop is not running. Inspect the stacktrace to " - f"find the original error: {repr(error)}.") diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py deleted file mode 100644 index cde8fc367fb5..000000000000 --- a/vllm/engine/multiprocessing/client.py +++ /dev/null @@ -1,682 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import copy -import pickle -from contextlib import contextmanager, suppress -from typing import (Any, AsyncGenerator, Dict, Iterator, List, Mapping, - Optional, Union, cast) - -import cloudpickle -import psutil -import zmq -import zmq.asyncio -from zmq import Frame # type: ignore[attr-defined] -from zmq.asyncio import Socket - -from vllm import PoolingParams -from vllm.config import DecodingConfig, ModelConfig, VllmConfig -from vllm.core.scheduler import SchedulerOutputs -# yapf conflicts with isort for this block -# yapf: disable -from vllm.engine.async_llm_engine import ( - build_guided_decoding_logits_processor_async) -from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT, - IPC_HEALTH_EXT, IPC_INPUT_EXT, - IPC_OUTPUT_EXT, RPC_REQUEST_T, - VLLM_RPC_SUCCESS_STR, RPCAbortRequest, - RPCAdapterLoadedResponse, RPCError, - RPCIsSleepingRequest, - RPCIsSleepingResponse, - RPCLoadAdapterRequest, - RPCProcessRequest, - RPCResetMultiModalCacheRequest, - RPCResetPrefixCacheRequest, - RPCSleepRequest, RPCStartupRequest, - RPCStartupResponse, - RPCUProfileRequest, RPCWakeUpRequest) -from vllm.engine.protocol import EngineClient -# yapf: enable -from vllm.envs import VLLM_RPC_TIMEOUT -from vllm.inputs import PromptType -from vllm.inputs.preprocess import InputPreprocessor -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import PoolingRequestOutput, RequestOutput -from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs -from vllm.utils import Device - -logger = init_logger(__name__) - - -class MQClientClosedError(Exception): - """Exception class raised when the client is used post-close. - - The client can be closed, which closes the ZMQ context. This normally - happens on server shutdown. In some cases, methods like abort and - do_log_stats will still be called and then try to open a socket, which - causes a ZMQError and creates a huge stack trace. - So, we throw this error such that we can suppress it. - """ - - -class MQLLMEngineClient(EngineClient): - """A client wrapper for MQLLMEngine that conforms to the - EngineClient protocol. - - MQLLMEngine and MQLLMEngineClient are intended to run in separate - processes communicating via zeromq ipc sockets. - - The entrypoint to MQLLMEngineClient is through the generate() - method. On generate() MQLLMEngine does three things: - - Creates an asyncio output queue - - Sends a RPCGenerateRequest to the MQLLMEngine via zmq - - Pulls RequestOutputs from its queue and yields them - - MQLLMEngine runs two background loops: - - output_loop: the output loop pulls List[RequestOutput] - from the MQLLMEngine via zmq (each list is the output - of one engine_step in the LLMEngine). 
It then parses - the list and pushes individual request_outputs into - the corresponding output_queue such that they can be - consumed by the .generate() method. - - health_loop: the health loop queries the health socket - every N seconds, confirming the engine is healthy - """ - - def __init__(self, ipc_path: str, engine_config: VllmConfig, - engine_pid: int): - self.context = zmq.asyncio.Context() - self._errored_with: Optional[BaseException] = None - - # Get the configs. - self.vllm_config = engine_config - self.model_config = engine_config.model_config - self.decoding_config = engine_config.decoding_config - - if self.vllm_config.model_config.skip_tokenizer_init: - self.tokenizer = None - - else: - # Create the tokenizer group. - self.tokenizer = init_tokenizer_from_configs( - model_config=self.model_config, - scheduler_config=engine_config.scheduler_config, - lora_config=engine_config.lora_config) - - self.input_preprocessor = InputPreprocessor(self.model_config, - self.tokenizer) - - # Send RPCGenerateRequest to the MQLLMEngine. - self.input_socket: Socket = self.context.socket(zmq.constants.PUSH) - self.input_socket.connect(f"{ipc_path}{IPC_INPUT_EXT}") - - # Receive streams of RequestOutput from the MQLLMEngine. - self.output_socket: Socket = self.context.socket(zmq.constants.PULL) - self.output_socket.connect(f"{ipc_path}{IPC_OUTPUT_EXT}") - - # IPC path for acking heartbeats. - self.heartbeat_socket: Socket = self.context.socket(zmq.constants.PULL) - self.heartbeat_socket.connect(f"{ipc_path}{IPC_HEALTH_EXT}") - - # IPC path for the data socket. - self.data_ipc_path = f"{ipc_path}{IPC_DATA_EXT}" - - # Stream for each individual request. - self.output_queues: Dict[str, asyncio.Queue] = {} - - # Loop to handle output of the LLMEngine periodically. - # Started after the MQLLMEngine is ready so that we can - # build the Client in an executor to enable clean shutdown. - self.output_loop: Optional[asyncio.Task] = None - - # Loop to check health of the LLMEngine periodically. - # Started after the MQLLMEngine is ready. - self.health_loop: Optional[asyncio.Task] = None - self._engine_process = psutil.Process(engine_pid) - - @staticmethod - def is_unsupported_config(vllm_config: VllmConfig): - # Pipeline parallel not yet supported - return vllm_config.parallel_config.pipeline_parallel_size > 1 - - @contextmanager - def get_data_socket(self) -> Iterator[Socket]: - socket = self.context.socket(zmq.constants.DEALER) - try: - socket.connect(self.data_ipc_path) - yield socket - finally: - socket.close(linger=0) - - async def run_heartbeat_loop(self, timeout: int): - """Background loop that continually checks to ensure the engine process - is still alive. 
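    A condensed view of the liveness check performed on each iteration
    (psutil.Process, is_running() and STATUS_ZOMBIE are standard psutil
    APIs; is_running() still returns True for zombie processes, hence the
    extra status check):

        proc = psutil.Process(engine_pid)
        alive = proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE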
- """ - try: - while True: - # Check if the engine process is running: - if not self._engine_process.is_running() or ( - self._engine_process.status() == psutil.STATUS_ZOMBIE): - # NB: is_running() returns True for zombies - self._set_errored( - RuntimeError( - f"Engine process (pid {self._engine_process.pid}) " - "died.")) - break - - if await self.heartbeat_socket.poll(timeout=timeout): - # Heartbeat received- check the message - await self._check_success( - error_message="Heartbeat failed.", - socket=self.heartbeat_socket) - - logger.debug("Heartbeat successful.") - - except asyncio.CancelledError: - logger.debug("Shutting down MQLLMEngineClient check health loop.") - - except psutil.NoSuchProcess: - self._set_errored( - RuntimeError( - f"Engine process (pid {self._engine_process.pid}) died.")) - - except Exception as e: - self._set_errored(e) - - async def run_output_handler_loop(self): - """Get RequestOutputs from Engine and stream to Request Queues""" - - try: - while True: - # Poll, checking for ENGINE_DEAD - while await self.output_socket.poll(timeout=VLLM_RPC_TIMEOUT - ) == 0: - logger.debug("Waiting for output from MQLLMEngine.") - - # If errored, alert all running requests. - if self.errored: - for queue_j in tuple(self.output_queues.values()): - queue_j.put_nowait( - ENGINE_DEAD_ERROR(self._errored_with)) - return - - message: Frame = await self.output_socket.recv(copy=False) - request_outputs = pickle.loads(message.buffer) - - is_error = isinstance(request_outputs, - (BaseException, RPCError)) - if is_error: - if isinstance(request_outputs, RPCError): - rpc_error: RPCError = request_outputs - request_id = rpc_error.request_id - exception = rpc_error.exception - is_engine_errored = rpc_error.is_engine_errored - else: - # MPLLMEngine should always return an RPCError to - # the output_socket when an issue arises. - # If we are here, we are in a bad state and - # should shut down the server. - error: BaseException = request_outputs - logger.error( - "Received Exception %s rather than RPCError from " - "MPLLMEngine. This should never happen.", error) - request_id = None - exception = error - is_engine_errored = True - - # Set to error state only on engine critical error - # (and record only the first one) - if is_engine_errored and not self._errored_with: - self._errored_with = exception - # If engine is errored, no matter the type of exception - # it will no longer be able to receive new requests, - # therefore we have to inform that the current - # processed requests failed as well. Send back a dead - # engine error give this feedback and also give a - # 'hint' to the server to shutdown next. - exception = self.dead_error - - if request_id is None: - # If request_id is None, then the engine raised an - # exception for a batch, and we may not know the - # request that caused it, neither if it was actually - # caused by any of them (e.g. CUDA OOM). Therefore we - # broadcast the same exception for all requests. - for queue_i in tuple(self.output_queues.values()): - queue_i.put_nowait(exception) - else: - queue = self.output_queues.get(request_id) - if queue is not None: - queue.put_nowait(exception) - # Put each output into the appropriate queue. 
- elif isinstance( - request_outputs, - (RPCAdapterLoadedResponse, RPCIsSleepingResponse)): - self._add_output(request_outputs) - else: - for request_output in request_outputs: - self._add_output(request_output) - - except asyncio.CancelledError: - logger.debug("Shutting down MQLLMEngineClient output handler.") - - def _add_output(self, request_output: Union[RequestOutput, - RPCAdapterLoadedResponse, - RPCIsSleepingResponse]): - queue = self.output_queues.get(request_output.request_id) - if queue is not None: - queue.put_nowait(request_output) - - async def setup(self): - """Setup the client before it starts sending server requests.""" - - # Start output_loop - if self.output_loop is None: - # only generate once to avoid multiple concurrent output_loops - # this will lead to race conditions and wrong orders of tokens - # returned by the engine - # setup will be called multiple times during the startup of - # the engine - self.output_loop = asyncio.create_task( - self.run_output_handler_loop()) - - with self.get_data_socket() as socket: - # Wait until server is ready. - response = await self._wait_for_server_rpc(socket) - - self.tracing_flag = response.tracing_enabled - - # Start health_loop. - if self.health_loop is None: - self.health_loop = asyncio.create_task( - self.run_heartbeat_loop(timeout=VLLM_RPC_TIMEOUT)) - - def close(self): - """Destroy the ZeroMQ Context.""" - # Close all sockets and terminate the context. - self.context.destroy(linger=0) - - # Cancel background tasks. - if self.health_loop is not None: - self.health_loop.cancel() - if self.output_loop is not None: - self.output_loop.cancel() - - def _set_errored(self, e: BaseException): - logger.exception(repr(e)) - if self._errored_with is None: - self._errored_with = e - - @staticmethod - async def _send_get_data_rpc_request(request: RPCStartupRequest, - expected_type: Any, - error_message: str, - socket: Socket) -> Any: - """Send an RPC request that is expecting data back.""" - - # Ping RPCServer with a request. - await socket.send_multipart((pickle.dumps(request), ), copy=False) - - # Make sure the server responds in time. - if await socket.poll(timeout=VLLM_RPC_TIMEOUT) == 0: - raise TimeoutError("RPCServer didn't reply within " - f"{VLLM_RPC_TIMEOUT} ms") - - # Await the data from the Server. 
- frame = await socket.recv(copy=False) - data = pickle.loads(frame.buffer) - - if isinstance(data, BaseException): - raise data - elif not isinstance(data, expected_type): - raise ValueError(error_message) - - return data - - @staticmethod - async def _send_one_way_rpc_request(request: RPC_REQUEST_T, - socket: Socket): - """Send one-way RPC request to trigger an action.""" - - if socket.closed: - raise MQClientClosedError() - - await socket.send_multipart((pickle.dumps(request), )) - - async def _await_ack(self, error_message: str, socket: Socket): - """Await acknowledgement that a request succeeded.""" - - if socket.closed: - raise MQClientClosedError() - - if await socket.poll(timeout=VLLM_RPC_TIMEOUT) == 0: - raise TimeoutError("MQLLMEngine didn't reply within " - f"{VLLM_RPC_TIMEOUT}ms") - - await self._check_success(error_message, socket) - - @staticmethod - async def _check_success(error_message: str, socket: Socket): - """Confirm that socket has a VLLM_RPC_SUCCESS_STR message""" - - if socket.closed: - raise MQClientClosedError() - - frame = await socket.recv(copy=False) - response = pickle.loads(frame.buffer) - - # Raise error if unsuccessful - if isinstance(response, BaseException): - raise response - elif (not isinstance(response, str) - or response != VLLM_RPC_SUCCESS_STR): - raise ValueError(error_message) - - async def get_input_preprocessor(self) -> InputPreprocessor: - return self.input_preprocessor - - async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None): - if self.tokenizer is None: - return None - else: - return await self.tokenizer.get_lora_tokenizer_async(lora_request) - - async def get_vllm_config(self) -> VllmConfig: - return self.vllm_config - - async def get_decoding_config(self) -> DecodingConfig: - return self.decoding_config - - async def get_model_config(self) -> ModelConfig: - return self.model_config - - async def is_tracing_enabled(self) -> bool: - return self.tracing_flag - - async def _wait_for_server_rpc(self, socket: Socket) -> RPCStartupResponse: - """Wait for the RPCServer to start up.""" - - return await self._send_get_data_rpc_request( - request=RPCStartupRequest.IS_SERVER_READY, - expected_type=RPCStartupResponse, - error_message="Unable to start RPC Server", - socket=socket) - - async def abort(self, request_id: str): - """Send an ABORT_REQUEST signal to the RPC Server""" - - with suppress(MQClientClosedError): - await self._send_one_way_rpc_request( - request=RPCAbortRequest(request_id), socket=self.input_socket) - - async def do_log_stats( - self, - scheduler_outputs: Optional[SchedulerOutputs] = None, - model_output: Optional[List[SamplerOutput]] = None, - ) -> None: - """ - Ignore do_log_stats (handled on MQLLMEngine polling) - """ - pass - - async def check_health(self): - """ - The check health loop probes the health status of the - Engine's health every N seconds and sets _errored_with - if the engine is unhealthy. 
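    A rough sketch of the caller side (the names `client` and
    `handle_engine_failure` are illustrative only):

        try:
            await client.check_health()
        except BaseException as exc:
            # _errored_with was set by the heartbeat loop
            handle_engine_failure(exc)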
- """ - if self._errored_with is not None: - raise self._errored_with - - @property - def is_running(self) -> bool: - return not self.errored - - @property - def is_stopped(self) -> bool: - return self.errored - - @property - def errored(self) -> bool: - return self._errored_with is not None - - @property - def dead_error(self) -> BaseException: - return ENGINE_DEAD_ERROR(self._errored_with) - - def generate( - self, - prompt: PromptType, - sampling_params: SamplingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> AsyncGenerator[RequestOutput, None]: - """Generate outputs for a request. - - Generate outputs for a request. This method is a coroutine. It adds the - request into the waiting queue of the LLMEngine and streams the outputs - from the LLMEngine to the caller. - - Args: - prompt: The prompt to the LLM. See - [`PromptType`][vllm.inputs.PromptType] for more details about - the format of each input. - sampling_params: The sampling parameters of the request. - request_id: The unique id of the request. - lora_request: LoRA request to use for generation, if any. - trace_headers: OpenTelemetry trace headers. - priority: Priority of the request (lower means earlier handling). - Any priority other than 0 will lead to an error if the - scheduling policy is not "priority". - """ - return cast( - AsyncGenerator[RequestOutput, None], - self._process_request(prompt, sampling_params, request_id, - lora_request, trace_headers, priority)) - - def encode( - self, - prompt: PromptType, - pooling_params: PoolingParams, - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> AsyncGenerator[PoolingRequestOutput, None]: - """Generate outputs for a request from a pooling model. - - Generate outputs for a request. This method is a coroutine. It adds the - request into the waiting queue of the LLMEngine and streams the outputs - from the LLMEngine to the caller. - - Args: - prompt: The prompt to the LLM. See - [`PromptType`][vllm.inputs.PromptType] for more details about - the format of each input. - pooling_params: The pooling parameters of the request. - request_id: The unique id of the request. - lora_request: LoRA request to use for generation, if any. - trace_headers: OpenTelemetry trace headers. - - Yields: - The output `PoolingRequestOutput` objects from the LLMEngine - for the request. - """ - return cast( - AsyncGenerator[PoolingRequestOutput, None], - self._process_request(prompt, - pooling_params, - request_id, - lora_request, - trace_headers, - priority=priority)) - - async def _process_request( - self, - prompt: PromptType, - params: Union[SamplingParams, PoolingParams], - request_id: str, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> Union[AsyncGenerator[RequestOutput, None], AsyncGenerator[ - PoolingRequestOutput, None]]: - """Send an RPCGenerateRequest to the RPCServer and stream responses.""" - - # If already dead, error out. 
- if self._errored_with is not None: - raise ENGINE_DEAD_ERROR(self._errored_with) - - # Ensure the request id is unique among running requests - if request_id in self.output_queues: - raise ValueError(f"Request {request_id} already exists") - - # Constructing guided decoding logits processors is expensive, so we do - # it here to avoid contending with cpu resources and the GIL on the - # backend process. - if isinstance(params, SamplingParams) and \ - params.guided_decoding is not None: - params = await \ - build_guided_decoding_logits_processor_async( - sampling_params=params, - tokenizer=await self.get_tokenizer(lora_request), - default_guided_backend=(self.decoding_config.backend - if self.decoding_config - else DecodingConfig.backend), - model_config=self.model_config, - reasoning_backend=self.decoding_config.reasoning_backend, - ) - - # 1) Create output queue for this requests. - queue: asyncio.Queue[Union[RequestOutput, - BaseException]] = asyncio.Queue() - self.output_queues[request_id] = queue - - try: - # 2) Detach logits processors so that they can be pickled - # separately (may require cloudpickle which is slower) - if isinstance(params, SamplingParams) and params.logits_processors: - # Defensive shallow copy - params = copy.copy(params) - logits_processors = params.logits_processors - params.logits_processors = None - lp_bytes = cloudpickle.dumps(logits_processors) - else: - lp_bytes = None - - request_bytes = pickle.dumps( - RPCProcessRequest( - prompt=prompt, - params=params, - request_id=request_id, - lora_request=lora_request, - trace_headers=trace_headers, - priority=priority, - )) - - # 3) Send the RPCGenerateRequest to the MQLLMEngine. - parts = (request_bytes, - lp_bytes) if lp_bytes else (request_bytes, ) - await self.input_socket.send_multipart(parts, copy=False) - - # 4) Stream the RequestOutputs from the output queue. Note - # that the output_loop pushes RequestOutput objects to this - # queue after pulling them from the zmq socket. - finished = False - try: - while not finished: - request_output = await queue.get() - - if isinstance(request_output, BaseException): - raise request_output - - finished = request_output.finished - yield request_output - finally: - # Request was canceled by the client. 
- if not finished and not self.errored: - await self.abort(request_id) - finally: - self.output_queues.pop(request_id) - - async def start_profile(self) -> None: - """Start profiling the engine""" - - await self._send_one_way_rpc_request( - request=RPCUProfileRequest.START_PROFILE, socket=self.input_socket) - - async def stop_profile(self) -> None: - """Stop profiling the engine""" - - await self._send_one_way_rpc_request( - request=RPCUProfileRequest.STOP_PROFILE, socket=self.input_socket) - - async def reset_mm_cache(self) -> None: - """Reset the multi-modal cache""" - - await self._send_one_way_rpc_request( - request=RPCResetMultiModalCacheRequest.RESET, - socket=self.input_socket) - - async def reset_prefix_cache(self, - device: Optional[Device] = None) -> None: - """Reset the prefix cache""" - - await self._send_one_way_rpc_request( - request=RPCResetPrefixCacheRequest(device), - socket=self.input_socket) - - async def sleep(self, level: int = 1) -> None: - """Sleep the engine for a given level""" - return await self._send_one_way_rpc_request( - request=RPCSleepRequest(level), socket=self.input_socket) - - async def wake_up(self, tags: Optional[list[str]] = None) -> None: - """Wake up the engine""" - return await self._send_one_way_rpc_request( - request=RPCWakeUpRequest(tags), socket=self.input_socket) - - async def is_sleeping(self) -> bool: - """Check whether the engine is sleeping""" - request = RPCIsSleepingRequest() - - queue: asyncio.Queue[Union[BaseException, - RPCIsSleepingResponse]] = asyncio.Queue() - self.output_queues[request.request_id] = queue - - request_bytes = pickle.dumps(request) - await self.input_socket.send_multipart((request_bytes, ), copy=False) - - request_output = await queue.get() - self.output_queues.pop(request.request_id) - - if isinstance(request_output, BaseException): - raise request_output - return request_output.is_sleeping - - async def add_lora(self, lora_request: LoRARequest) -> None: - """Load a new LoRA adapter into the engine for future requests.""" - # Uses the same I/O as generate requests - request = RPCLoadAdapterRequest(lora_request) - - # Create output queue for this requests. 
- queue: asyncio.Queue[Union[None, BaseException]] = asyncio.Queue() - self.output_queues[request.request_id] = queue - - # Send the request - request_bytes = pickle.dumps(request) - await self.input_socket.send_multipart((request_bytes, ), copy=False) - - # Wait for the response - request_output = await queue.get() - self.output_queues.pop(request.request_id) - - # Raise on error, otherwise happily return None - if isinstance(request_output, BaseException): - raise request_output diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py deleted file mode 100644 index fe6eb0d8c2f1..000000000000 --- a/vllm/engine/multiprocessing/engine.py +++ /dev/null @@ -1,458 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pickle -import signal -from contextlib import contextmanager -from typing import Iterator, List, Optional, Union - -import cloudpickle -import zmq - -from vllm import AsyncEngineArgs, SamplingParams -from vllm.config import VllmConfig -from vllm.engine.llm_engine import LLMEngine -# yapf conflicts with isort for this block -# yapf: disable -from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT, - IPC_HEALTH_EXT, IPC_INPUT_EXT, - IPC_OUTPUT_EXT, REQUEST_OUTPUTS_T, - VLLM_RPC_SUCCESS_STR, RPCAbortRequest, - RPCAdapterLoadedResponse, RPCError, - RPCIsSleepingRequest, - RPCIsSleepingResponse, - RPCLoadAdapterRequest, - RPCProcessRequest, - RPCResetMultiModalCacheRequest, - RPCResetPrefixCacheRequest, - RPCSleepRequest, RPCStartupRequest, - RPCStartupResponse, - RPCUProfileRequest, RPCWakeUpRequest) -# yapf: enable -from vllm.logger import init_logger -from vllm.outputs import RequestOutput -from vllm.transformers_utils.config import ( - maybe_register_config_serialize_by_value) -from vllm.usage.usage_lib import UsageContext -from vllm.worker.model_runner_base import InputProcessingError - -logger = init_logger(__name__) - -POLLING_TIMEOUT_MS = 10000 -HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), ) - - -class MQLLMEngine: - """A multiprocessing wrapper for - [`LLMEngine`][vllm.engine.llm_engine.LLMEngine]. - - This class is used to wrap the - [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use - in concurrnet manner. It runs a background loop and uses zeromq to - receive new requests and stream outputs incrementally via ipc. - - The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode - process is kicked off when a new RPCProcessRequest is received by the - input_socket. - - The self.engine_loop checks the input_socket for new requests, - adds them to the LLMEngine if there are any, calls the internal - [`LLMEngine.step()`][vllm.engine.llm_engine.LLMEngine.step], and sends - the RequestOutputs back over the output_socket. - - If use_async_sockets is set, the logic associated with reading new - requests from the socket and sending data to the socket is passed - as a callback to the llm_engine, which calls the logic asynchronously - such that the IPC can be overlapped with the GPU. - - Args: - ipc_path: Base path for zeromq interprocess messaging - use_async_sockets: Whether to make send/recv async with GPU - log_requests: Whether to log the requests. - *args: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine]. - **kwargs: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine]. 
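    A heavily simplified sketch of the ipc loop described above
    (heartbeats, error handling and the async-socket callback are omitted;
    `engine` stands in for the wrapped LLMEngine instance):

        ctx = zmq.Context()
        input_socket = ctx.socket(zmq.PULL)    # client -> engine requests
        output_socket = ctx.socket(zmq.PUSH)   # engine -> client outputs
        input_socket.bind(f"{ipc_path}{IPC_INPUT_EXT}")
        output_socket.bind(f"{ipc_path}{IPC_OUTPUT_EXT}")
        while True:
            while input_socket.poll(timeout=0) != 0:
                request = pickle.loads(input_socket.recv_multipart()[0])
                engine.add_request(request_id=request.request_id,
                                   prompt=request.prompt,
                                   params=request.params)
            request_outputs = engine.step()
            output_socket.send_multipart((pickle.dumps(request_outputs), ))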
- """ - - def __init__(self, - ipc_path: str, - use_async_sockets: bool, - *args, - log_requests: bool = True, - **kwargs) -> None: - # For MQLLMEngine, we can use cached outputs, since each new request - # output is immediately pickled and send over the socket, which frees - # the python object to be reused again. - kwargs['use_cached_outputs'] = True - - self.engine = LLMEngine(*args, **kwargs) - self.log_requests = log_requests - - self.use_async_sockets = use_async_sockets - if self.use_async_sockets: - self.engine.process_request_outputs_callback = \ - self._async_socket_engine_callback - - self.ctx = zmq.Context() # type: ignore[attr-defined] - - # Receive input from the client. - self.input_socket = self.ctx.socket(zmq.constants.PULL) - self.input_socket.bind(f"{ipc_path}{IPC_INPUT_EXT}") - - # Send output stream back to client. - self.output_socket = self.ctx.socket(zmq.constants.PUSH) - self.output_socket.bind(f"{ipc_path}{IPC_OUTPUT_EXT}") - - # Send heartbeats back to client. - self.heartbeat_socket = self.ctx.socket(zmq.constants.PUSH) - self.heartbeat_socket.bind(f"{ipc_path}{IPC_HEALTH_EXT}") - - # IPC path for the data socket. - self.data_ipc_path = f"{ipc_path}{IPC_DATA_EXT}" - - # Error state. - self._errored_with: Optional[BaseException] = None - - @property - def dead_error(self) -> BaseException: - if self._errored_with is not None: - return ENGINE_DEAD_ERROR(self._errored_with) - else: - return ENGINE_DEAD_ERROR() - - @classmethod - def from_vllm_config(cls, vllm_config: VllmConfig, - usage_context: UsageContext, - disable_log_requests: bool, disable_log_stats: bool, - ipc_path: str) -> "MQLLMEngine": - # Setup plugins for each process - from vllm.plugins import load_general_plugins - load_general_plugins() - - use_async_sockets = vllm_config.model_config.use_async_output_proc - - return cls( - vllm_config=vllm_config, - executor_class=LLMEngine._get_executor_cls(vllm_config), - ipc_path=ipc_path, - usage_context=usage_context, - use_async_sockets=use_async_sockets, - log_requests=(not disable_log_requests), - log_stats=(not disable_log_stats), - ) - - @staticmethod - def from_engine_args(engine_args: AsyncEngineArgs, - usage_context: UsageContext, ipc_path: str): - """Creates an MQLLMEngine from the engine arguments.""" - - vllm_config = engine_args.create_engine_config(usage_context) - return MQLLMEngine.from_vllm_config( - ipc_path=ipc_path, - vllm_config=vllm_config, - usage_context=usage_context, - disable_log_requests=engine_args.disable_log_requests, - disable_log_stats=engine_args.disable_log_stats, - ) - - def start(self): - try: - try: - logger.debug("Starting Startup Loop.") - self.run_startup_loop() - logger.debug("Starting Engine Loop.") - self.run_engine_loop() - except Exception as e: - logger.exception(repr(e)) - except KeyboardInterrupt: - logger.debug("Shutting down MQLLMEngine.") - finally: - logger.debug("MQLLMEngine is shut down.") - self.cleanup() - - def cleanup(self): - """Cleanup zeromq state on shutdown.""" - # Closes all sockets and destroys context. 
- self.ctx.destroy(linger=0) - del self.engine - - @contextmanager - def make_data_socket( - self) -> Iterator[zmq.Socket]: # type: ignore[name-defined] - socket = self.ctx.socket(zmq.constants.ROUTER) - try: - socket.bind(self.data_ipc_path) - yield socket - finally: - socket.close(linger=0) - - def run_startup_loop(self) -> None: - """Startup loop for sending data from Engine -> Client.""" - - with self.make_data_socket() as socket: - response: Union[RPCStartupResponse, BaseException] - try: - identity, message = socket.recv_multipart(copy=False) - request: RPCStartupRequest = pickle.loads(message.buffer) - - # Handle the query from the Client. - if request == RPCStartupRequest.IS_SERVER_READY: - tracing_enabled = self.engine.is_tracing_enabled() - response = RPCStartupResponse( - tracing_enabled=tracing_enabled) - - except Exception as e: - response = e - - socket.send_multipart((identity, pickle.dumps(response)), - copy=False) - - def run_engine_loop(self): - """Core busy loop of the LLMEngine.""" - - while True: - if not self.engine.has_unfinished_requests(): - # Poll until there is work to do. - while self.input_socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: - # When there's no work, check on engine health and send - # health status back to client - self._health_check() - self.engine.do_log_stats() - logger.debug("Waiting for new requests in engine loop.") - - # Handle any input from the client. - self.handle_new_input() - - # Engine step. - request_outputs = self.engine_step() - - # Send request outputs (if async, done in engine_step callback). - if not self.use_async_sockets: - self._send_outputs(request_outputs) - - def engine_step(self) -> List[RequestOutput]: - """Engine step wrapper with error handling.""" - try: - return self.engine.step() - except SystemExit: - raise - except InputProcessingError as e: - # Special case where we handle an error preparing the inputs for - # a single request in the batch - rpc_err = RPCError(request_id=e.request_id, - is_engine_errored=False, - exception=e.__cause__) - self._send_outputs(rpc_err) - return [] - except BaseException as e: - self._set_errored(e) - rpc_err = RPCError(request_id=None, - is_engine_errored=True, - exception=e) - self._send_outputs(rpc_err) - raise e - - def handle_new_input(self): - """Handle new input from the socket""" - try: - while self.input_socket.poll(timeout=0) != 0: - frames = self.input_socket.recv_multipart(copy=False) - request = pickle.loads(frames[0].buffer) - - if isinstance(request, RPCProcessRequest): - if len(frames) > 1: - # Use cloudpickle for logits processors - assert isinstance(request.params, SamplingParams) - lprocs = cloudpickle.loads(frames[1].buffer) - request.params.logits_processors = lprocs - self._handle_process_request(request) - elif isinstance(request, RPCAbortRequest): - self._handle_abort_request(request) - elif isinstance(request, RPCUProfileRequest): - if request == RPCUProfileRequest.START_PROFILE: - self.start_profile() - else: - self.stop_profile() - elif isinstance(request, RPCLoadAdapterRequest): - self._handle_load_adapter_request(request) - elif isinstance(request, RPCResetMultiModalCacheRequest): - self.reset_mm_cache() - elif isinstance(request, RPCResetPrefixCacheRequest): - self.reset_prefix_cache() - elif isinstance(request, RPCSleepRequest): - self.sleep(request.value) - elif isinstance(request, RPCWakeUpRequest): - self.wake_up(request.tags) - elif isinstance(request, RPCIsSleepingRequest): - self._handle_is_sleeping_request(request) - else: - raise 
ValueError("Unknown RPCRequest Type: " - f"{type(request)}") - - except Exception as e: - self._set_errored(e) - self._send_unhealthy(e) - raise e from None - - def _handle_process_request(self, request: RPCProcessRequest): - """Handle RPCProcessRequest by adding it to the LLMEngine.""" - request_id = request.request_id - - if self._errored_with is not None: - rpc_err = RPCError(request_id=request_id, - is_engine_errored=True, - exception=ENGINE_DEAD_ERROR(self._errored_with)) - self._send_outputs(rpc_err) - - try: - self.engine.add_request(request_id=request_id, - prompt=request.prompt, - params=request.params, - lora_request=request.lora_request, - trace_headers=request.trace_headers, - priority=request.priority) - - if self.log_requests: - logger.info("Added request %s.", request.request_id) - - except Exception as e: - # We do not set self._errored = True here, since the error - # is due to an issue adding this request to the engine, - # rather than an issue with the engine itself. - logger.debug("Failed to add request %s to engine. %s", - request.request_id, e) - is_errored = self._errored_with is not None - rpc_err = RPCError(request_id=request_id, - is_engine_errored=is_errored, - exception=e) - self._send_outputs(rpc_err) - - # Remove request from the engine. - self.engine.abort_request(request_id) - - def _handle_abort_request(self, request: RPCAbortRequest): - self.engine.abort_request(request.request_id) - if self.log_requests: - logger.info("Aborted request %s.", request.request_id) - - def _handle_load_adapter_request(self, request: RPCLoadAdapterRequest): - try: - self.engine.add_lora(request.lora_request) - except BaseException as e: - # Send back an error if the adater fails to load - rpc_err = RPCError(request_id=request.request_id, - is_engine_errored=False, - exception=e) - self._send_outputs(rpc_err) - return - # Otherwise, send back the successful load message - self._send_outputs( - RPCAdapterLoadedResponse(request_id=request.request_id)) - - def _handle_is_sleeping_request(self, request: RPCIsSleepingRequest): - is_sleeping = self.is_sleeping() - self._send_outputs( - RPCIsSleepingResponse(request_id=request.request_id, - is_sleeping=is_sleeping)) - - def _health_check(self): - # Send unhealthy if engine has already errored - if self._errored_with is not None: - self._send_unhealthy(self._errored_with) - try: - self.engine.check_health() - self._send_healthy() - except Exception as e: - self._set_errored(e) - self._send_unhealthy(e) - - def _send_outputs(self, outputs: REQUEST_OUTPUTS_T): - """Send outputs back to the engine client. These can be: - - Exceptions - - A list of generation outputs - - A response from loading a lora adapter - """ - if outputs: - try: - from ray.exceptions import RayTaskError - - # RayTaskError might not pickelable here. We need to unpack the - # underlying exception as the real exception in the output. 
- if (isinstance(outputs, RPCError) - and isinstance(outputs.exception, RayTaskError)): - outputs.exception = outputs.exception.cause - except ImportError: - pass - - output_bytes = pickle.dumps(outputs) - self.output_socket.send_multipart((output_bytes, ), copy=False) - - def _send_healthy(self): - """Send HEALTHY message to RPCClient.""" - if not self.heartbeat_socket.closed: - self.heartbeat_socket.send_multipart(HEALTHY_RESPONSE, copy=False) - - def _send_unhealthy(self, error: BaseException): - """Send UNHEALTHY message to RPCClient.""" - if not self.heartbeat_socket.closed: - error_bytes = pickle.dumps(error) - self.heartbeat_socket.send_multipart((error_bytes, ), copy=False) - - def _async_socket_engine_callback(self, - request_outputs: REQUEST_OUTPUTS_T): - """Callback used by engine to make socket handling async with GPU.""" - self._send_outputs(request_outputs) - self.handle_new_input() - - def _set_errored(self, e: BaseException): - """Log and set errored status if this is the first issue.""" - if self._errored_with is None: - self._errored_with = e - - def start_profile(self) -> None: - self.engine.start_profile() - - def stop_profile(self) -> None: - self.engine.stop_profile() - - def reset_mm_cache(self) -> bool: - return self.engine.reset_mm_cache() - - def reset_prefix_cache(self) -> bool: - return self.engine.reset_prefix_cache() - - def sleep(self, level: int = 1) -> None: - self.engine.sleep(level) - - def wake_up(self, tags: Optional[list[str]] = None) -> None: - self.engine.wake_up(tags) - - def is_sleeping(self) -> bool: - return self.engine.is_sleeping() - - -def signal_handler(*_) -> None: - raise KeyboardInterrupt("MQLLMEngine terminated") - - -def run_mp_engine(vllm_config: VllmConfig, usage_context: UsageContext, - ipc_path: str, disable_log_stats: bool, - disable_log_requests: bool, engine_alive): - try: - # Ensure we can serialize transformer config before spawning - maybe_register_config_serialize_by_value() - - engine = MQLLMEngine.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - disable_log_stats=disable_log_stats, - disable_log_requests=disable_log_requests, - ipc_path=ipc_path) - - signal.signal(signal.SIGTERM, signal_handler) - - engine.start() - - except BaseException as e: - logger.exception(e) - engine_alive.value = False - raise e from None diff --git a/vllm/engine/output_processor/__init__.py b/vllm/engine/output_processor/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py deleted file mode 100644 index 19c5963d32db..000000000000 --- a/vllm/engine/output_processor/interfaces.py +++ /dev/null @@ -1,75 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import ABC, abstractmethod -from typing import Callable, List - -from vllm.config import SchedulerConfig -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.sequence import Sequence, SequenceGroup, SequenceGroupOutput -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import Counter - - -class SequenceGroupOutputProcessor(ABC): - """Interface for logic that processes new token ids in sequence groups, - managing detokenization, stop checking, and freeing/forking sequences with - the scheduler. 
- - This is highly coupled with the LLMEngine and should be seen as an extension - of it. The logic is separated to simplify the LLMEngine class and allow - separate implementations for single-step decoding (which supports beam - search sequence forking) and multi-step decoding (which does not support - beam search, but does support speculative decoding). - """ - - @staticmethod - def create_output_processor( - scheduler_config: SchedulerConfig, - detokenizer: Detokenizer, - scheduler: List[Scheduler], - seq_counter: Counter, - get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer], - stop_checker: "StopChecker", - ): - """Create an output processor. - - This returns a single-step output processor if num_lookahead_slots is - zero, else returns a multi-step output processor. - """ - if scheduler_config.num_lookahead_slots == 0: - # Importing here to avoid cycle. - from vllm.engine.output_processor.single_step import ( - SingleStepOutputProcessor) - return SingleStepOutputProcessor(scheduler_config, detokenizer, - scheduler, seq_counter, - stop_checker) - else: - # Importing here to avoid cycle. - from vllm.engine.output_processor.multi_step import ( - MultiStepOutputProcessor) - return MultiStepOutputProcessor( - detokenizer, - scheduler, - seq_counter, - get_tokenizer_for_seq, - stop_checker, - ) - - @abstractmethod - def process_outputs(self, sequence_group: SequenceGroup, - outputs: List[SequenceGroupOutput], - is_async: bool) -> None: - """Process new token ids for the sequence group. Handles logic such as - detokenization, stop checking, and freeing/forking sequences in the - scheduler. - """ - pass - - @abstractmethod - def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: List[SequenceGroupOutput]) -> None: - """Update prompt logprobs received from outputs to seq_group.""" - pass diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py deleted file mode 100644 index 8b66ef0dc765..000000000000 --- a/vllm/engine/output_processor/multi_step.py +++ /dev/null @@ -1,211 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import functools -from typing import Callable, List, cast - -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.interfaces import ( - SequenceGroupOutputProcessor) -from vllm.engine.output_processor.single_step import ( - single_step_process_prompt_logprob) -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.logger import init_logger -from vllm.sampling_params import SamplingParams -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, - CompletionSequenceGroupOutput, Sequence, - SequenceGroup, SequenceGroupOutput, SequenceOutput, - SequenceStatus) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import Counter - -logger = init_logger(__name__) - - -class MultiStepOutputProcessor(SequenceGroupOutputProcessor): - """SequenceGroupOutputProcessor which handles logic related to - detokenization and stopping conditions. It specializes to "multi-step - decoding", where vLLM's worker may generate multiple tokens per invocation. - This is currently mutually exclusive with advanced sampling techniques like - beam search, which motivates the separation of this logic from the single - step output processor. 
- - This class is responsible for things such as correctly appending all new - token ids to their sequence, detokenizing new token ids, truncating new - output tokens after an eos token, and correctly handling the case where the - number of new output tokens per sequence differs in a single batch. - """ - - def __init__( - self, - detokenizer: Detokenizer, - scheduler: List[Scheduler], - seq_counter: Counter, - get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer], - stop_checker: StopChecker, - ): - self.detokenizer = detokenizer - self.scheduler = scheduler - self.seq_counter = seq_counter - self.get_tokenizer_for_seq = get_tokenizer_for_seq - self.stop_checker = stop_checker - - def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: List[SequenceGroupOutput]) -> None: - """Process prompt logprobs associated with each step of a multi-step- - scheduled computation. - - Args: - seq_group: the outputs are associated with this - [`SequenceGroup`][vllm.sequence.SequenceGroup] - outputs: the - [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]s - for all scheduler steps - """ - for output in outputs: - # Concatenate single-step prompt logprob processing results. - assert isinstance(output, CompletionSequenceGroupOutput) - single_step_process_prompt_logprob(self, seq_group, output) - - @staticmethod - @functools.lru_cache - def _log_prompt_logprob_unsupported_warning_once(): - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - logger.warning( - "Prompt logprob is not supported by multi step workers. " - "(e.g., speculative decode uses multi step workers).") - - def process_outputs(self, - sequence_group: SequenceGroup, - outputs: List[SequenceGroupOutput], - is_async: bool = False) -> None: - """Append new tokens in the outputs to sequences in the sequence group. - - This only supports sequence groups of size 1. It supports greater than - one new token per sequence. - - This applies logic like stop condition checking and detokenization. - It also handles cases where there are tokens emitted after - the EOS token. - - is_async - Indicates whether this postprocessor runs in - parallel with the GPU forward pass and is processing - tokens from the previous step. If this is true, then - no tokens need to be appended since it is already done - externally (before the next schedule() call) - """ - # Sequences can be in RUNNING or FINISHED_ABORTED state - # once scheduled, as a sequence is moved to FINISHED_ABORTED - # if a client disconnects from the api server. - seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING) - if seqs is None: - seqs = sequence_group.get_seqs( - status=SequenceStatus.FINISHED_ABORTED) - - assert seqs, "Expected RUNNING or FINISHED_ABORTED sequences" - assert len(seqs) == 1, ( - "Beam search not supported in multi-step decoding.") - seq = seqs[0] - seq_id = seq.seq_id - # This method is defined in the more generic - # SequenceGroupOutputProcessor, but here we assume that the outputs are - # of a more specific type. - assert all([ - isinstance(output, CompletionSequenceGroupOutput) - for output in outputs - ]) - compl_outputs = cast(List[CompletionSequenceGroupOutput], outputs) - assert all([ - seq_id == output.samples[0].parent_seq_id - for output in compl_outputs - ]) - - if is_async: - # Async case: We process tokens one by one. 
Here, we know the token - # was already appended, so we only need to do the rest of the - # postprocessor: Detokenization + stopping logic - self._process_decode_and_stop(seq, sequence_group.sampling_params) - else: - # Standard multi-step case - - # Since there's only one sequence per sequence group, - # we can take the first sample. - samples = [output.samples[0] for output in compl_outputs] - - # entries in sample tokens may be invalid (eg. due to spec decode - # rejecting tokens). - valid_samples = [ - sample for sample in samples - if sample.output_token != VLLM_INVALID_TOKEN_ID - ] - - # When both spec-decode and pre-fill chunking are enabled, we - # don't have guaranteed samples here (e.g. all -1s). - if valid_samples: - self._process_seq_outputs(seq, valid_samples, - sequence_group.sampling_params) - - def _process_decode_and_stop(self, seq: Sequence, - sampling_params: SamplingParams) -> None: - new_char_count = 0 - if sampling_params.detokenize and self.detokenizer: - new_char_count = self.detokenizer.decode_sequence_inplace( - seq, sampling_params) - - # TODO(sang): Support lora. - self.stop_checker.maybe_stop_sequence( - seq, - new_char_count=new_char_count, - sampling_params=sampling_params, - ) - - def _process_seq_outputs(self, seq: Sequence, - valid_samples: List[SequenceOutput], - sampling_params: SamplingParams) -> None: - output_token_ids = [sample.output_token for sample in valid_samples] - output_logprobs = [sample.logprobs for sample in valid_samples] - output_embeds = [sample.output_embed for sample in valid_samples] - - # Truncate to max_tokens if necessary. - remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() + - len(output_token_ids)) - if remaining_tokens < 0: - output_token_ids = output_token_ids[:remaining_tokens] - - # Truncate any tokens after EOS. This is required as spec decode - # generates a fixed number of tokens without evaluating stopping - # conditions within the block. This can cause an eos token to be - # unintentionally ignored. - if not sampling_params.ignore_eos and self.detokenizer: - eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id - # Avoiding .index calls as exception throwing in the happy path - # is expensive. - for i in range(len(output_token_ids)): - if output_token_ids[i] == eos_token_id: - output_token_ids = output_token_ids[:i + 1] - break - - is_prefill_sampled_token = seq.data.get_num_uncomputed_tokens() == 0 - # Incrementally append tokens to the sequence, as if we had only one new - # token. - for output_token_id, output_logprob, output_embed in zip( - output_token_ids, output_logprobs, output_embeds): - seq.append_token_id( - token_id=output_token_id, - logprobs=output_logprob, - token_embed=output_embed, - ) - - if is_prefill_sampled_token: - is_prefill_sampled_token = False - else: - # Update num_computed_tokens iff the sampled token is not from - # a prefill step. 
- seq.data.update_num_computed_tokens(1) - - self._process_decode_and_stop(seq, sampling_params) - - if seq.is_finished(): - break diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py deleted file mode 100644 index dbf6a371d050..000000000000 --- a/vllm/engine/output_processor/single_step.py +++ /dev/null @@ -1,145 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List - -from vllm.config import SchedulerConfig -from vllm.core.scheduler import Scheduler -from vllm.engine.output_processor.interfaces import ( - SequenceGroupOutputProcessor) -from vllm.engine.output_processor.stop_checker import StopChecker -from vllm.logger import init_logger -from vllm.sequence import (CompletionSequenceGroupOutput, SequenceGroup, - SequenceGroupOutput) -from vllm.transformers_utils.detokenizer import Detokenizer -from vllm.utils import Counter - -logger = init_logger(__name__) - - -def single_step_process_prompt_logprob( - sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup, - output: CompletionSequenceGroupOutput) -> None: - """Process prompt logprobs associated with the - [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step. - - Do nothing if the output has no prompt logprobs. - - Account for the fact that transformers do not compute first-token logprobs. - - Args: - sg_output_proc: - [`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor] - instance - seq_group: the output is associated with this - [`SequenceGroup`][vllm.sequence.SequenceGroup] - output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] - for a single scheduler step - """ - prompt_logprobs = output.prompt_logprobs - - # If this is the first (or only) "chunk" of the prefill, we need - # to prepend None to the list of prompt logprobs. The reason for this - # is that for N prompt tokens, the Sampler will generate N-1 total - # prompt logprobs during prefill since the token at idx 0 will not - # have a logprob associated with it. - if prompt_logprobs is not None: - if not seq_group.prompt_logprobs: - prompt_logprobs = [None] + prompt_logprobs - seq_group.prompt_logprobs = [] - - assert hasattr(sg_output_proc, 'detokenizer') - if (seq_group.sampling_params.detokenize - and sg_output_proc.detokenizer): - sg_output_proc.detokenizer.decode_prompt_logprobs_inplace( - seq_group, - prompt_logprobs, - position_offset=len(seq_group.prompt_logprobs)) - - seq_group.prompt_logprobs.extend(prompt_logprobs) - - -class SingleStepOutputProcessor(SequenceGroupOutputProcessor): - """SequenceGroupOutputProcessor which handles "output processing" logic, - which happens after the model returns generated token ids and before - scheduling of the next batch. Output processing logic includes - detokenization, and determining if a sequence is finished (e.g. via max len - or eos token). - - The SingleStepOutputProcessor is specialized to the case where the model - emits at most a single token per invocation, which precludes configurations - such as speculative decoding or multi-step decoding. This enables beam - search sampling, which requires forking/finishing/freeing sequences in a way - that is currently difficult to schedule multiple steps ahead of time. 
- """ - - def __init__(self, scheduler_config: SchedulerConfig, - detokenizer: Detokenizer, scheduler: List[Scheduler], - seq_counter: Counter, stop_checker: StopChecker): - self.scheduler_config = scheduler_config - self.detokenizer = detokenizer - self.scheduler = scheduler - self.seq_counter = seq_counter - self.stop_checker = stop_checker - - def process_outputs(self, sequence_group: SequenceGroup, - outputs: List[SequenceGroupOutput], - is_async: bool) -> None: - """Append all new tokens to sequences in the sequence group. Fork any - surviving beam candidates; free any unsurviving ones. - - Invokes detokenizer to detokenize new tokens, and also marks sequences - as finished if they meet stop conditions. - - is_async - Indicates whether this postprocessor runs in - parallel with the GPU forward pass and is processing - tokens from the previous step. If this is true, then - no tokens need to be appended since it is already done - externally (before the next schedule() call) - """ - assert (len(outputs) == 1 - ), f"{type(self)} does not support multiple outputs per step" - return self._process_sequence_group_outputs(sequence_group, outputs[0], - is_async) - - def process_prompt_logprob(self, seq_group: SequenceGroup, - outputs: List[SequenceGroupOutput]) -> None: - """Process prompt logprobs associated with one step of a single-step- - scheduled computation. - - Args: - seq_group: the output is associated with this - [`SequenceGroup`][vllm.sequence.SequenceGroup] - outputs: the - [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] - for a single scheduler step - """ - assert len(outputs) == 1, "Single step should only have 1 output." - output = outputs[0] - assert isinstance(output, CompletionSequenceGroupOutput) - single_step_process_prompt_logprob(self, seq_group, output) - - def _process_sequence_group_outputs(self, seq_group: SequenceGroup, - outputs: SequenceGroupOutput, - is_async: bool) -> None: - sampling_params = seq_group.sampling_params - - sample = outputs.samples[0] - seq = seq_group.first_seq - if not is_async: - seq.append_token_id(sample.output_token, sample.logprobs, - sample.output_embed) - if sampling_params.detokenize and self.detokenizer: - new_char_count = self.detokenizer.decode_sequence_inplace( - seq, sampling_params) - else: - new_char_count = 0 - self.stop_checker.maybe_stop_sequence( - seq, - new_char_count, - sampling_params, - lora_req=seq_group.lora_request, - ) - if seq.is_finished(): - for scheduler in self.scheduler: - scheduler.free_seq(seq) diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py deleted file mode 100644 index 3fb2f71b5e99..000000000000 --- a/vllm/engine/output_processor/stop_checker.py +++ /dev/null @@ -1,131 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Callable, List, Optional, Tuple - -from vllm.lora.request import LoRARequest -from vllm.sampling_params import SamplingParams -from vllm.sequence import Sequence, SequenceStatus -from vllm.transformers_utils.tokenizer import AnyTokenizer - - -class StopChecker: - """LLMEngine helper class which separates out the logic involving stop - checking. This checks things such as: whether the eos token was emitted, - whether the max_tokens has been consumed, whether a stop string has been - emitted, or if we have exceeded the max model len. 
- """ - - def __init__(self, max_model_len: int, - get_tokenizer_for_seq: Callable[[Sequence], AnyTokenizer]): - # Do not use it directly, but use `self._get_max_model_len`. - self._max_model_len = max_model_len - self.get_tokenizer_for_seq = get_tokenizer_for_seq - - def _get_max_model_len(self, lora_req: Optional[LoRARequest]): - if lora_req and lora_req.long_lora_max_len: - return lora_req.long_lora_max_len - else: - return self._max_model_len - - def maybe_stop_sequence( - self, - seq: Sequence, - new_char_count: int, - sampling_params: SamplingParams, - lora_req: Optional[LoRARequest] = None, - ) -> None: - """Stop the finished sequences. - - new_char_count is the number of chars added to the - sequence's output text for the newly generated token - """ - - # Check if the minimum number of tokens has been generated yet; - # skip the stop string/token checks if not - if seq.get_output_len() < sampling_params.min_tokens: - return - - # Check if the sequence has generated the EOS token. - if ((not sampling_params.ignore_eos) - and seq.get_last_token_id() == seq.eos_token_id): - # Remove the last EOS token unless explicitly specified - # This prevents unintended exposure of the EOS token - if new_char_count and ( - not sampling_params.include_stop_str_in_output): - seq.output_text = seq.output_text[:-new_char_count] - seq.status = SequenceStatus.FINISHED_STOPPED - return - - # Check if a stop token was encountered. - # This assumes a single token produced per step. - last_token_id = seq.get_last_token_id() - if last_token_id in (sampling_params.stop_token_ids or ()): - if new_char_count and ( - not sampling_params.include_stop_str_in_output): - # Remove last token - seq.output_text = seq.output_text[:-new_char_count] - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = last_token_id - return - - # Check if any stop strings are matched. - stop = self.check_stop_strings( - seq.output_text, new_char_count, sampling_params.stop, - sampling_params.include_stop_str_in_output) - if stop is not None: - stop_str, truncate_to = stop - if truncate_to != -1: - seq.output_text = seq.output_text[:truncate_to] - seq.status = SequenceStatus.FINISHED_STOPPED - seq.stop_reason = stop_str - return - - # Check if the sequence has reached max_model_len. - if seq.get_len() >= self._get_max_model_len(lora_req): - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - # Check if the sequence has reached max_tokens. - if seq.get_output_len() == sampling_params.max_tokens: - seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED - return - - @staticmethod - def check_stop_strings( - output_text: str, - new_char_count: int, - stop: List[str], - include_in_output: bool, - ) -> Optional[Tuple[str, int]]: - """Check if any stop strings are matched and truncate sequence - output text accordingly. - - Returns tuple (stop_string, offset) if matched or else None. - - Where stop_string is the matched stop string and offset is the - length to which output_text should be truncated, or -1 for no - truncation. - """ - if not new_char_count or not stop: - return None - - for stop_str in stop: - stop_string_len = len(stop_str) - # Avoid searching already-searched text. - stop_index = output_text.find(stop_str, - 1 - new_char_count - stop_string_len) - if stop_index == -1: - continue - - if include_in_output: - # Truncate to end of stop string. - stop_index += stop_string_len - if stop_index >= len(output_text): - # No truncation required. 
- return stop_str, -1 - - # Truncate the output text to either the beginning - # or end of the stop string. - return stop_str, stop_index - return None diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py deleted file mode 100644 index 1e127eb98242..000000000000 --- a/vllm/engine/output_processor/util.py +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List -from typing import Sequence as GenericSequence -from typing import cast - -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import CompletionSequenceGroupOutput, SequenceGroupOutput - - -def create_output_by_sequence_group( - outputs: GenericSequence[SamplerOutput], - num_seq_groups: int) -> List[List[SequenceGroupOutput]]: - """Helper method which transforms a 2d list organized by - [step][sequence group] into [sequence group][step]. - """ - output_by_sequence_group: List[List[CompletionSequenceGroupOutput]] = [ - [] for _ in range(num_seq_groups) - ] - for step in outputs: - sequence_group_output: CompletionSequenceGroupOutput - for i, sequence_group_output in enumerate(step): - output_by_sequence_group[i].append(sequence_group_output) - - # Cast to the more generic type that CompletionSequenceGroupOutput - # inherits from. - return cast(List[List[SequenceGroupOutput]], output_by_sequence_group) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 671e9648a3d0..326332653c1c 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -7,7 +7,6 @@ from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function from vllm.config import DecodingConfig, ModelConfig, VllmConfig -from vllm.core.scheduler import SchedulerOutputs from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.inputs.preprocess import InputPreprocessor @@ -272,7 +271,7 @@ async def is_tracing_enabled(self) -> bool: @abstractmethod async def do_log_stats( self, - scheduler_outputs: Optional[SchedulerOutputs] = None, + scheduler_outputs=None, model_output: Optional[list[SamplerOutput]] = None, ) -> None: ... 
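For context on the removed vllm/engine/output_processor/util.py helper above: a minimal, hypothetical sketch of the [step][sequence group] -> [sequence group][step] transpose it performed. Plain lists stand in for SamplerOutput and CompletionSequenceGroupOutput, and the name transpose_by_sequence_group is illustrative rather than the original API.

# Simplified stand-in for the removed create_output_by_sequence_group helper;
# element types are generic here instead of vLLM's sequence-group output types.
from typing import List, TypeVar

T = TypeVar("T")


def transpose_by_sequence_group(outputs_per_step: List[List[T]],
                                num_seq_groups: int) -> List[List[T]]:
    """Turn outputs_per_step[step][seq_group] into result[seq_group][step]."""
    result: List[List[T]] = [[] for _ in range(num_seq_groups)]
    for step_outputs in outputs_per_step:
        for i, seq_group_output in enumerate(step_outputs):
            result[i].append(seq_group_output)
    return result


# Example: two scheduler steps over three sequence groups.
# transpose_by_sequence_group([["a0", "b0", "c0"], ["a1", "b1", "c1"]], 3)
# returns [["a0", "a1"], ["b0", "b1"], ["c0", "c1"]]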
diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py deleted file mode 100644 index 4e8c6d79095f..000000000000 --- a/vllm/executor/mp_distributed_executor.py +++ /dev/null @@ -1,244 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import os -from typing import Any, Callable, List, Optional, Union - -import cloudpickle - -from vllm.executor.executor_base import DistributedExecutorBase -from vllm.executor.multiproc_worker_utils import ( - ProcessWorkerWrapper, ResultHandler, WorkerMonitor, - set_multiprocessing_worker_envs) -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless, - get_distributed_init_method, get_ip, get_open_port, - make_async, run_method, update_environment_variables) -from vllm.worker.worker_base import WorkerWrapperBase - -logger = init_logger(__name__) - - -class MultiprocessingDistributedExecutor(DistributedExecutorBase): - """Python multiprocessing-based distributed executor""" - - uses_ray: bool = False - - def _check_cuda(self) -> None: - """Check that the number of GPUs is sufficient for the parallel - configuration. Separate from _init_executor to reduce the number of - indented blocks. - """ - parallel_config = self.parallel_config - world_size = parallel_config.world_size - tensor_parallel_size = parallel_config.tensor_parallel_size - - cuda_device_count = cuda_device_count_stateless() - # Use confusing message for more common TP-only case. - if tensor_parallel_size > cuda_device_count: - raise RuntimeError( - f"please set tensor_parallel_size ({tensor_parallel_size}) " - f"to less than max local gpu count ({cuda_device_count})") - - if world_size > cuda_device_count: - raise RuntimeError( - f"please ensure that world_size ({world_size}) " - f"is less than than max local gpu count ({cuda_device_count})") - - # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers - if "CUDA_VISIBLE_DEVICES" not in os.environ: - update_environment_variables({ - "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) - }) - - def _init_executor(self) -> None: - - from vllm.platforms import current_platform - if current_platform.is_cuda_alike(): - self._check_cuda() - - # Create the parallel GPU workers. - world_size = self.parallel_config.world_size - tensor_parallel_size = self.parallel_config.tensor_parallel_size - - # Set multiprocessing envs that are common to V0 and V1 - set_multiprocessing_worker_envs(self.parallel_config) - - # Multiprocessing-based executor does not support multi-node setting. - # Since it only works for single node, we can use the loopback address - # 127.0.0.1 for communication. - distributed_init_method = get_distributed_init_method( - "127.0.0.1", get_open_port()) - - self.workers: List[ProcessWorkerWrapper] = [] - # This is the list of workers that are rank 0 of each TP group EXCEPT - # global rank 0. These are the workers that will broadcast to the - # rest of the workers. - self.tp_driver_workers: List[ProcessWorkerWrapper] = [] - # This is the list of workers that are not drivers and not the first - # worker in a TP group. These are the workers that will be - # broadcasted to. 
- self.non_driver_workers: List[ProcessWorkerWrapper] = [] - - if world_size == 1: - self.worker_monitor = None - else: - result_handler = ResultHandler() - for rank in range(1, world_size): - worker = ProcessWorkerWrapper(result_handler, - WorkerWrapperBase, - self.vllm_config, rank) - self.workers.append(worker) - if rank % tensor_parallel_size == 0: - self.tp_driver_workers.append(worker) - else: - self.non_driver_workers.append(worker) - - self.worker_monitor = WorkerMonitor(self.workers, result_handler) - result_handler.start() - self.worker_monitor.start() - - # Set up signal handlers to shutdown the executor cleanly - # sometimes gc does not work well - - self.driver_worker = WorkerWrapperBase(self.vllm_config, 0) - - all_kwargs = [] - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - for i in range(world_size): - local_rank = i - rank = i - kwargs = dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - is_driver_worker=(not self.parallel_config) - or (rank % self.parallel_config.tensor_parallel_size == 0), - ) - all_kwargs.append(kwargs) - self._run_workers("init_worker", all_kwargs) - self._run_workers("init_device") - self._run_workers("load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers) - self.driver_exec_model = make_async(self.driver_worker.execute_model) - self.pp_locks: Optional[List[asyncio.Lock]] = None - - def shutdown(self): - if (worker_monitor := getattr(self, "worker_monitor", - None)) is not None: - worker_monitor.close() - - def _driver_execute_model( - self, execute_model_req: Optional[ExecuteModelRequest] - ) -> Optional[List[SamplerOutput]]: - """Run execute_model in the driver worker. - - Passing None will cause the driver to stop the model execution - loop running in each of the remote workers. - """ - return self.driver_worker.execute_model(execute_model_req) - - def _run_workers( - self, - method: Union[str, Callable], - *args, - async_run_tensor_parallel_workers_only: bool = False, - max_concurrent_workers: Optional[int] = None, - **kwargs, - ) -> List[Any]: - """Runs the given method on all workers. - - Args: - async_run_tensor_parallel_workers_only: If True the method will be - run only in the remote TP workers, not the driver worker. - It will also be run asynchronously and return a list of futures - rather than blocking on the results. - """ - if isinstance(method, str): - sent_method = method - else: - sent_method = cloudpickle.dumps(method) - del method - - if max_concurrent_workers: - raise NotImplementedError( - "max_concurrent_workers is not supported yet.") - - if async_run_tensor_parallel_workers_only: - # Run only non-driver workers and just return futures. - return [ - worker.execute_method(sent_method, *args, **kwargs) - for worker in self.non_driver_workers - ] - - # Start all remote workers first. - worker_outputs = [ - worker.execute_method(sent_method, *args, **kwargs) - for worker in self.workers - ] - - driver_worker_output = run_method(self.driver_worker, sent_method, - args, kwargs) - - # Get the results of the workers. 
- return [driver_worker_output - ] + [output.get() for output in worker_outputs] - - def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - if self.worker_monitor is not None and not self.worker_monitor.is_alive( - ): - raise RuntimeError("Worker processes are not running") - - def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: - """Wait for futures returned from _run_workers() with - async_run_remote_workers_only to complete.""" - for result in parallel_worker_tasks: - result.get() - - async def _driver_execute_model_async( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - if not self.tp_driver_workers: - return await self.driver_exec_model(execute_model_req) - - if self.pp_locks is None: - # This locks each pipeline parallel stage so multiple virtual - # engines can't execute on the same stage at the same time - # We create the locks here to avoid creating them in the constructor - # which uses a different asyncio loop. - self.pp_locks = [ - asyncio.Lock() - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - - tasks = [ - asyncio.create_task( - _run_task_with_lock(self.driver_exec_model, self.pp_locks[0], - execute_model_req)) - ] - for pp_rank, driver_worker in enumerate(self.tp_driver_workers, - start=1): - tasks.append( - asyncio.create_task( - _run_task_with_lock(driver_worker.execute_method_async, - self.pp_locks[pp_rank], - "execute_model", execute_model_req))) - results = await asyncio.gather(*tasks) - - # Only the last PP stage has the final results. - return results[-1] - - async def _start_worker_execution_loop(self): - coros = [ - worker.execute_method_async("start_worker_execution_loop") - for worker in self.non_driver_workers - ] - return await asyncio.gather(*coros) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py deleted file mode 100644 index e93be9bfb165..000000000000 --- a/vllm/model_executor/layers/logits_processor.py +++ /dev/null @@ -1,198 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A layer that compute logits from hidden_stats.""" -import inspect -from concurrent.futures import ThreadPoolExecutor -from typing import Optional - -import torch -import torch.nn as nn - -import vllm.envs as envs -from vllm.distributed import (tensor_model_parallel_all_gather, - tensor_model_parallel_gather) -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.platforms import current_platform - -_logits_processor_threadpool: Optional[ThreadPoolExecutor] = None -if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None: - _logits_processor_threadpool = ThreadPoolExecutor( - envs.VLLM_LOGITS_PROCESSOR_THREADS) - - -class LogitsProcessor(nn.Module): - """Process logits and apply logits processors from sampling metadata. - - This layer does the following: - 1. Gather logits from model hidden_states. - 2. Scale logits if needed. - 3. Apply logits processors (if any). - """ - - def __init__(self, - vocab_size: int, - org_vocab_size: Optional[int] = None, - scale: float = 1.0, - logits_as_input: bool = False, - soft_cap: Optional[float] = None) -> None: - """ - Args: - scale: A scaling factor to apply to the logits. 
- """ - super().__init__() - self.scale = scale - self.vocab_size = vocab_size - # Whether the input is logits (default is hidden states). - self.logits_as_input = logits_as_input - # original vocabulary size (without LoRA). - self.org_vocab_size = org_vocab_size or vocab_size - # Soft cap the logits. Used in Gemma 2. - self.soft_cap = soft_cap - # Whether to use gather or all-gather to gather the logits. - self.use_all_gather = current_platform.use_all_gather() - - def forward( - self, - lm_head: VocabParallelEmbedding, - hidden_states: torch.Tensor, - sampling_metadata: Optional[SamplingMetadata] = None, - embedding_bias: Optional[torch.Tensor] = None, - prune_hidden_states: bool = True, - ) -> Optional[torch.Tensor]: - if self.logits_as_input: - logits = hidden_states - else: - if sampling_metadata is not None and prune_hidden_states: - hidden_states = _prune_hidden_states(hidden_states, - sampling_metadata) - - # Get the logits for the next tokens. - logits = self._get_logits(hidden_states, lm_head, embedding_bias) - if logits is not None: - if self.soft_cap is not None: - logits = logits / self.soft_cap - logits = torch.tanh(logits) - logits = logits * self.soft_cap - - if self.scale != 1.0: - logits *= self.scale - - # Apply logits processors (if any). - if sampling_metadata is not None and \ - sampling_metadata.seq_groups is not None: - logits = _apply_logits_processors(logits, sampling_metadata) - - return logits - - def _gather_logits(self, logits: torch.Tensor) -> torch.Tensor: - """gather/all-gather the logits tensor across model parallel group.""" - if self.use_all_gather: - # Gather is not supported for some devices such as TPUs. - # Use all-gather instead. - # NOTE(woosuk): Here, the outputs of every device should not be None - # because XLA requires strict SPMD among all devices. Every device - # should execute the same operations after gathering the logits. - logits = tensor_model_parallel_all_gather(logits) - else: - # None may be returned for rank > 0 - logits = tensor_model_parallel_gather(logits) - return logits - - def _get_logits( - self, - hidden_states: torch.Tensor, - lm_head: VocabParallelEmbedding, - embedding_bias: Optional[torch.Tensor], - ) -> Optional[torch.Tensor]: - # Get the logits for the next tokens. - logits = lm_head.quant_method.apply(lm_head, - hidden_states, - bias=embedding_bias) - - # Gather logits for TP - logits = self._gather_logits(logits) - - # Remove paddings in vocab (if any). - if logits is not None: - logits = logits[..., :self.org_vocab_size] - return logits - - def extra_repr(self) -> str: - s = f"vocab_size={self.vocab_size}" - s += f", org_vocab_size={self.org_vocab_size}" - s += f", scale={self.scale}, logits_as_input={self.logits_as_input}" - return s - - -def _prune_hidden_states( - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, -) -> torch.Tensor: - # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios - # (warmup, profile_run) we might not have selected_token_indices, - # so we skip pruning. 
- if sampling_metadata.selected_token_indices is not None: - return hidden_states.index_select( - 0, sampling_metadata.selected_token_indices) - else: - return hidden_states - - -def _apply_logits_processors( - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, -) -> torch.Tensor: - found_logits_processors = False - logits_processed = 0 - logits_row_ids_and_logits_row_futures = [] - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - logits_processors = sampling_params.logits_processors - if logits_processors: - found_logits_processors = True - - for seq_id, logits_row_idx in zip(seq_ids, - seq_group.sample_indices): - logits_row = logits[logits_row_idx] - past_tokens_ids = seq_group.seq_data[seq_id].output_token_ids - prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids - - if _logits_processor_threadpool is not None: - logits_row_ids_and_logits_row_futures.append( - (logits_row_idx, - _logits_processor_threadpool.submit( - _apply_logits_processors_single_seq, logits_row, - logits_processors, past_tokens_ids, - prompt_tokens_ids))) - else: - logits[logits_row_idx] = \ - _apply_logits_processors_single_seq( - logits_row, logits_processors, past_tokens_ids, - prompt_tokens_ids) - - logits_processed += len(seq_group.sample_indices) + len( - seq_group.prompt_logprob_indices) - - for logits_row_idx, future in logits_row_ids_and_logits_row_futures: - logits[logits_row_idx] = future.result() - - if found_logits_processors: - # verifies that no rows in logits were missed unexpectedly - assert logits_processed == logits.shape[0] - return logits - - -def _apply_logits_processors_single_seq(logits_row, logits_processors, - past_tokens_ids, - prompt_tokens_ids) -> torch.Tensor: - for logits_processor in logits_processors: - parameters = inspect.signature(logits_processor).parameters - if len(parameters) == 3: - logits_row = logits_processor(prompt_tokens_ids, past_tokens_ids, - logits_row) - else: - logits_row = logits_processor(past_tokens_ids, logits_row) - return logits_row diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py deleted file mode 100644 index e77eb637c894..000000000000 --- a/vllm/model_executor/layers/sampler.py +++ /dev/null @@ -1,1198 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A layer that samples the next tokens from the model's outputs.""" -import itertools -from collections.abc import Iterator -from dataclasses import dataclass -from importlib.util import find_spec -from math import inf -from typing import Optional, Union - -import msgspec -import torch -import torch.nn as nn - -import vllm.envs as envs -from vllm.model_executor.layers.utils import apply_penalties -from vllm.model_executor.sampling_metadata import (SamplingMetadata, - SamplingTensors, - SequenceGroupToSample) -from vllm.sampling_params import SamplingType -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, - CompletionSequenceGroupOutput, Logprob, - PromptLogprobs, SampleLogprobs, SequenceOutput) - -if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"): - # yapf: disable - from flashinfer.sampling import ( - top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) - - # yapf: enable -else: - flashinfer_top_k_top_p_sampling = None - -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -def get_sampler() -> torch.nn.Module: - if envs.VLLM_USE_V1: - # Lazy import: the 
v1 package isn't distributed - from vllm.v1.sample.sampler import Sampler as V1Sampler - return V1Sampler() - return Sampler() - - -# (num_token_ids, num_parent_ids) per sequence group. -SampleResultType = list[tuple[list[int], list[int]]] - -# Types of temporary data structures used for -# computing sample_result -SampleMetadataType = dict[SamplingType, tuple[list[int], - list[SequenceGroupToSample]]] -MultinomialSamplesType = dict[SamplingType, torch.Tensor] -SampleResultsDictType = dict[int, tuple[list[int], list[int]]] - - -# Encapsulates temporary data structures for computing -# sample_result. -# -# * For multi-step scheduling: must be returned -# by `Sampler.forward()` and used later to compute the pythonized -# sample_result -# -# * For single-step scheduling: consumed immediately -# inside `Sampler.forward()` to compute pythonized sample_result. -@dataclass -class SampleResultArgsType: - sample_metadata: SampleMetadataType - multinomial_samples: MultinomialSamplesType - sample_results_dict: SampleResultsDictType - sampling_metadata: SamplingMetadata - greedy_samples: Optional[torch.Tensor] - - -# Union of non-deferred (single-step scheduling) -# vs deferred (multi-step scheduling) -# sample result types -MaybeDeferredSampleResultType = Union[SampleResultType, SampleResultArgsType] - -# Abbreviation of the _sample() return type -SampleReturnType = tuple[MaybeDeferredSampleResultType, Optional[torch.Tensor]] - - -class SamplerOutput( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] - """For each sequence group, we generate a list of SequenceOutput object, - each of which contains one possible candidate for the next token. - - This data structure implements methods, so it can be used like a list, but - also has optional fields for device tensors. - """ - - outputs: list[CompletionSequenceGroupOutput] - - # On-device tensor containing probabilities of each token. - sampled_token_probs: Optional[torch.Tensor] = None - - # On-device tensor containing the logprobs of each token. - logprobs: Optional["torch.Tensor"] = None - - # Holds either (1) the pythonized sampler result (single-step scheduling) - # or (2) what will be arguments for later deferred pythonization of the - # sampler result (muliti-step scheduling) - deferred_sample_results_args: Optional[SampleResultArgsType] = None - - # On-device tensor containing the sampled token ids. - sampled_token_ids: Optional[torch.Tensor] = None - # CPU tensor containing the sampled token ids. Used during multi-step to - # return the sampled token ids from last rank to AsyncLLMEngine to be - # 'broadcasted' to all other PP ranks for next step. - sampled_token_ids_cpu: Optional[torch.Tensor] = None - - # On-device tensor containing the sampled token embeddings (embeddings - # corresponding to the sampled token ids). Used when prompt embeddings are - # specified in lieu of prompt token ids or text. - sampled_token_embeds: Optional[torch.Tensor] = None - - # Optional last hidden states from the model. - hidden_states: Optional[torch.Tensor] = None - - # Optional prefill hidden states from the model - # (used for models like EAGLE). - prefill_hidden_states: Optional[torch.Tensor] = None - - # Time taken in the forward pass for this across all workers - model_forward_time: Optional[float] = None - - # Time taken in the model execute function. This will include model forward, - # block/sync across workers, cpu-gpu sync time and sampling time. 
- model_execute_time: Optional[float] = None - - def __getitem__(self, idx: int) -> CompletionSequenceGroupOutput: - return self.outputs[idx] - - def __setitem__(self, idx: int, value): - self.outputs[idx] = value - - def __iter__(self) -> Iterator[CompletionSequenceGroupOutput]: - return iter(self.outputs) - - def __len__(self): - return len(self.outputs) - - def __eq__(self, other: object): - return isinstance(other, - self.__class__) and self.outputs == other.outputs - - def __repr__(self) -> str: - """Show the shape of a tensor instead of its values to reduce noise. - """ - sampled_token_probs_repr = ("None" if self.sampled_token_probs is None - else self.sampled_token_probs.shape) - sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else - self.sampled_token_ids.shape) - return (f"SamplerOutput(outputs={self.outputs}, " - f"sampled_token_probs={sampled_token_probs_repr}, " - f"sampled_token_ids={sampled_token_ids_repr})") - - -class Sampler(nn.Module): - """Samples the next tokens from the model's outputs. - - This layer does the following: - 1. Discard the hidden states that are not used for sampling (i.e., all - tokens except the final one in each prompt). - 2. Compute the logits for the next tokens. - 3. Apply presence, frequency and repetition penalties. - 4. Apply temperature scaling. - 5. Apply top-p and top-k truncation. - 6. Sample the next tokens. - Here, each sequence group within the batch can have different sampling - parameters (e.g., sampling method, temperature, top-p, top-k, etc.). - - The structure of the logits tensor is coupled with the seq_groups in - sampling_metadata. Typically, each sequence in each seq_group has one row in - logits for the next token to be sampled; however, for a seq_group with a - prompt request with the prompt_logprobs sampling parameter, there are rows - in logits for each token in the input prompt. - """ - - def __init__(self): - super().__init__() - - # Whether or not the SamplerOutput should have on-device tensors - # containing the sampled token ids and probabilities. This is used by - # speculative decoding and when prompt embeddings are specified. - self.include_gpu_probs_tensor = False - self.should_modify_greedy_probs_inplace = False - - def _init_sampling_tensors( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ): - """The goal here is to reuse sampling tensors between similar decode - runs. This is possible because sampling logic does not change between - decodes of the same sequences. - """ - _, vocab_size = logits.shape - - # First free any existing stored sampling tensors. - # This is necessary because some sampling tensors may - # have pinned memory. 
- self._sampling_tensors = None - - # Initialize new sampling tensors - (sampling_tensors, do_penalties, do_top_p_top_k, - do_min_p) = SamplingTensors.from_sampling_metadata( - sampling_metadata, vocab_size, logits.device, logits.dtype) - - self._sampling_tensors = sampling_tensors - self._do_penalties = do_penalties - self._do_top_p_top_k = do_top_p_top_k - self._do_min_p = do_min_p - - def forward( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - """ - Single-step scheduling: - * Perform GPU-side sampling computation & compute - GPU-side logprobs tensor - * Pythonize sampling result & logprobs tensor - - Multi-step scheduling: - * Perform GPU-side sampling computation & compute - GPU-side logprobs tensor - * Defer Pythonization of sampling result & logprobs - tensor - * Encapsulate arguments required for deferred Pythonization - in the - [`SamplerOutput`][vllm.model_executor.layers.sampler.SamplerOutput] - structure - - Args: - logits: (num_tokens, vocab_size). - sampling_metadata: Metadata for sampling. - """ - assert logits is not None - _, vocab_size = logits.shape - - # Prepare sampling tensors with pinned memory to avoid blocking. - if not sampling_metadata.reuse_sampling_tensors: - self._init_sampling_tensors(logits, sampling_metadata) - elif self._do_penalties: - # In this case, the sampling tensors logic depends on - # "output_tokens" of a sequence. As a result, we cannot - # reuse sampling tensors, since "output_tokens" changes - # between decode runs. - self._init_sampling_tensors(logits, sampling_metadata) - - assert self._sampling_tensors is not None - sampling_tensors = self._sampling_tensors - do_penalties = self._do_penalties - do_top_p_top_k = self._do_top_p_top_k - do_min_p = self._do_min_p - - logits = _apply_min_tokens_penalty(logits, sampling_metadata) - - # Apply presence and frequency penalties. - if do_penalties: - logits = apply_penalties(logits, sampling_tensors.prompt_tokens, - sampling_tensors.output_tokens, - sampling_tensors.presence_penalties, - sampling_tensors.frequency_penalties, - sampling_tensors.repetition_penalties) - - # Use float32 to apply temperature scaling. - # Use in-place division to avoid creating a new tensor. - logits = logits.to(torch.float) - logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) - - if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None: - logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, - sampling_tensors.top_ks) - - if do_min_p: - logits = _apply_min_p(logits, sampling_tensors.min_ps) - - # We use float32 for probabilities and log probabilities. - # Compute the probabilities. - probs = torch.softmax(logits, dim=-1, dtype=torch.float) - # Compute the log probabilities. - logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float) - - # Sample the next tokens. - maybe_deferred_sample_results, maybe_sampled_tokens_tensor = _sample( - probs, - logprobs, - sampling_metadata, - sampling_tensors, - include_gpu_probs_tensor=self.include_gpu_probs_tensor, - modify_greedy_probs=self._should_modify_greedy_probs_inplace, - ) - - if self.include_gpu_probs_tensor: - # Since we will defer sampler result Pythonization, - # preserve GPU-side tensors in support of later - # deferred pythonization of logprobs - assert maybe_sampled_tokens_tensor is not None - on_device_tensors = (probs, logprobs, maybe_sampled_tokens_tensor) - else: - # Since Pythonization has already happened, don't preserve - # GPU-side tensors. 
- on_device_tensors = None - - # Get the logprobs query results. - prompt_logprobs = None - sample_logprobs = None - if not sampling_metadata.skip_sampler_cpu_output: - # Pythonize logprobs now (GPU -> CPU); do not defer. - assert not isinstance(maybe_deferred_sample_results, - SampleResultArgsType) - prompt_logprobs, sample_logprobs = get_logprobs( - logprobs, sampling_metadata, maybe_deferred_sample_results) - - return _build_sampler_output( - maybe_deferred_sample_results, - sampling_metadata, - prompt_logprobs, - sample_logprobs, - on_device_tensors=on_device_tensors, - skip_sampler_cpu_output=sampling_metadata.skip_sampler_cpu_output) - - @property - def _should_modify_greedy_probs_inplace(self) -> bool: - """Whether or not the sampler should modify the probability distribution - of greedily-sampled tokens such that multinomial sampling would sample - the greedily-sampled token. - - In other words, if True then we set the probability of the greedily- - sampled token to 1. - - This is used by speculative decoding, which requires that the sampling - method be encoded into the probability distribution. - """ - return self.should_modify_greedy_probs_inplace - - -def _apply_min_tokens_penalty( - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, -) -> torch.Tensor: - """Apply min_tokens penalty which sets stop tokens to -inf if min_tokens - have not been generated yet - """ - # list of indices in logits that will be set to -inf - logits_to_penalize: list[tuple[int, int]] = [] - logits_applied = 0 - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - - sample_indices = seq_group.sample_indices - logits_applied += len(sample_indices) + len( - seq_group.prompt_logprob_indices) - if not seq_group.do_sample: - continue - - start_idx = sample_indices[0] - min_tokens = sampling_params.min_tokens - token_ids_to_penalize = sampling_params.all_stop_token_ids - if min_tokens > 0 and token_ids_to_penalize: - seqs_to_penalize: list[int] = [] - for j, seq_id in enumerate(seq_ids): - seq_data = seq_group.seq_data[seq_id] - if len(seq_data.output_token_ids_array) < min_tokens: - seqs_to_penalize.append(j) - - if seqs_to_penalize: - # convert to the index into logits - seqs_to_penalize = [start_idx + j for j in seqs_to_penalize] - # itertools.product pairs each seq index with every token id - logits_to_penalize.extend( - itertools.product(seqs_to_penalize, token_ids_to_penalize)) - - if logits_to_penalize: - # use zip and * to group indices along each dimension - # eg. [ (1,2), (1,3), (5,6) ] -> ( (1,1,5), (2,3,6) ) - logits[tuple(zip(*logits_to_penalize))] = -float("inf") - - # verifies that no rows in logits were missed unexpectedly - assert logits_applied == logits.shape[0] - return logits - - -def _apply_top_k_top_p( - logits: torch.Tensor, - p: torch.Tensor, - k: torch.Tensor, -) -> torch.Tensor: - logits_sort, logits_idx = logits.sort(dim=-1, descending=False) - - # Apply top-k. - top_k_mask = logits_sort.size(1) - k.to(torch.long) - # Get all the top_k values. - top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1)) - top_k_mask = logits_sort < top_k_mask - logits_sort.masked_fill_(top_k_mask, -float("inf")) - - # Apply top-p. - probs_sort = logits_sort.softmax(dim=-1) - probs_sum = probs_sort.cumsum(dim=-1) - top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1) - # at least one - top_p_mask[:, -1] = False - logits_sort.masked_fill_(top_p_mask, -float("inf")) - - # Re-sort the probabilities. 
- logits = torch.empty_like(logits_sort).scatter_(dim=-1, - index=logits_idx, - src=logits_sort) - return logits - - -def _apply_min_p( - logits: torch.Tensor, - min_p: torch.Tensor, -) -> torch.Tensor: - """ - Adapted from - https://github.com/oobabooga/text-generation-webui/blob/3146124ec01f02c8fb1650a6517cf1b60b537aaf/modules/sampler_hijack.py#L16C17-L16C17 - """ - probs = torch.softmax(logits, dim=-1) - top_probs, _ = probs.max(dim=-1, keepdim=True) - scaled_min_p = min_p.unsqueeze_(dim=1) * top_probs - tokens_to_remove = probs < scaled_min_p - logits = logits.masked_fill_(tokens_to_remove, -float("inf")) - - return logits - - -def _greedy_sample( - selected_seq_groups: list[SequenceGroupToSample], - samples: torch.Tensor, -) -> SampleResultType: - """Run greedy sampling on a given samples. - - Args: - selected_seq_groups: A list of sequence groups batched. - samples: (num_selected_samples,) A tensor of samples. The length of - samples could be smaller than selected_seq_groups if - seq_group.do_sample is False. - Returns: - Tuple of (next_token_ids, parent_ids). The length of returned list is - same as the length of selected_seq_groups. If the corresponding - seq_group has do_sample=False, tuple contains ([], []) - """ - samples_lst = samples.tolist() - sample_idx = 0 - results: SampleResultType = [] - for seq_group in selected_seq_groups: - if not seq_group.do_sample: - results.append(([], [])) - continue - - seq_ids = seq_group.seq_ids - num_parent_seqs = len(seq_ids) - assert num_parent_seqs == 1, ( - "Greedy sampling should have only one seq.") - parent_ids = list(range(num_parent_seqs)) - next_token_ids = [samples_lst[sample_idx]] - results.append((next_token_ids, parent_ids)) - sample_idx += num_parent_seqs - return results - - -def _random_sample( - selected_seq_groups: list[SequenceGroupToSample], - random_samples: torch.Tensor, -) -> SampleResultType: - """Run random sampling on a given samples. - - Args: - selected_seq_groups: A list of sequence groups batched. - random_samples: (num_selected_samples,) A tensor of samples. The - length of samples could be smaller than selected_seq_groups if - seq_group.do_sample is False. - Returns: - Tuple of (next_token_ids, parent_ids). The length of returned list is - same as the length of selected_seq_groups. If the corresponding - seq_group has do_sample=False, tuple contains ([], []) - """ - # Find the maximum n value of the prompt phase requests. - random_samples = random_samples.cpu() - sample_idx = 0 - results: SampleResultType = [] - for seq_group in selected_seq_groups: - if not seq_group.do_sample: - results.append(([], [])) - continue - - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - is_prompt = seq_group.is_prompt - num_parent_seqs = len(seq_ids) - if is_prompt: - # Prompt phase. - parent_ids = [0] * sampling_params.n - next_token_ids = random_samples[ - sample_idx, :sampling_params.n].tolist() - else: - # Generation phase. - parent_ids = list(range(num_parent_seqs)) - next_token_ids = random_samples[sample_idx:sample_idx + - num_parent_seqs, 0].tolist() - results.append((next_token_ids, parent_ids)) - sample_idx += num_parent_seqs - return results - - -# torch.multinomial forces a GPU<->CPU sync. -# Therefore, we use an optimized implementation instead. -# Note that we always sample with replacement. -# probs will be modified in place, but this is fine, as we pass -# in a copy already. 
-def _multinomial( - probs: torch.Tensor, - num_samples: int, - seq_groups: Optional[list[SequenceGroupToSample]] = None, -) -> torch.Tensor: - if num_samples > 1: - probs = probs.repeat_interleave(num_samples, dim=0) - q = torch.empty_like(probs) - if seq_groups is None: - q.exponential_() - else: - sample_idx = 0 - for seq_group in seq_groups: - seq_ids = seq_group.seq_ids - stride = len(seq_ids) * num_samples - assert seq_group.generator is not None - q[sample_idx:sample_idx + - stride].exponential_(generator=seq_group.generator) - sample_idx += stride - return probs.div_(q).argmax(dim=1).view(-1, num_samples) - - -def _top_k_top_p_multinomial_with_flashinfer( - probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor, - num_samples: int, seq_groups: Optional[list[SequenceGroupToSample]]): - if num_samples > 1: - probs = probs.repeat_interleave(num_samples, dim=0) - top_ks = top_ks.repeat_interleave(num_samples) - top_ps = top_ps.repeat_interleave(num_samples) - batch_next_token_ids = flashinfer_top_k_top_p_sampling( - probs, - top_ks, - top_ps, - ) - return batch_next_token_ids.view(-1, num_samples) - - -def get_pythonized_sample_results( - sample_result_args: SampleResultArgsType) -> SampleResultType: - '''This function consumes GPU-side sampler results and computes - Pythonized CPU-side sampler results (GPU -> CPU sync.) - - Single-step scheduling: this function is invoked at sampling-time - for immediate Pythonization. - - Multi-step scheduling: Pythonization is deferred until after multiple - GPU-side steps have been completed. - - Args: - sample_result_args: GPU-side inputs to the Pythonization process - - Returns: - Pythonized sampler results - ''' - - ( - sample_metadata, - sampling_metadata, - greedy_samples, - multinomial_samples, - sample_results_dict, - ) = ( - sample_result_args.sample_metadata, - sample_result_args.sampling_metadata, - sample_result_args.greedy_samples, - sample_result_args.multinomial_samples, - sample_result_args.sample_results_dict, - ) - - for sampling_type in SamplingType: - if sampling_type not in sample_metadata: - continue - (seq_group_id, seq_groups) = sample_metadata[sampling_type] - if sampling_type == SamplingType.GREEDY: - sample_results = _greedy_sample(seq_groups, greedy_samples) - elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): - sample_results = _random_sample(seq_groups, - multinomial_samples[sampling_type]) - sample_results_dict.update(zip(seq_group_id, sample_results)) - - return [ - sample_results_dict.get(i, ([], [])) - for i in range(len(sampling_metadata.seq_groups)) - ] - - -def _sample_with_torch( - probs: torch.Tensor, - logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, - sampling_tensors: SamplingTensors, - include_gpu_probs_tensor: bool, - modify_greedy_probs: bool, -) -> SampleReturnType: - '''Torch-oriented _sample() implementation. 
- - Single-step scheduling: - * Perform GPU-side sampling computation - * Immediately Pythonize sampling result - - Multi-step scheduling: - * Perform GPU-side sampling computation - * Defer Pythonization & preserve GPU-side - tensors required for Pythonization - ''' - - categorized_seq_group_ids: dict[SamplingType, list[int]] = { - t: [] - for t in SamplingType - } - categorized_sample_indices = sampling_metadata.categorized_sample_indices - for i, seq_group in enumerate(sampling_metadata.seq_groups): - sampling_params = seq_group.sampling_params - sampling_type = sampling_params.sampling_type - categorized_seq_group_ids[sampling_type].append(i) - - sample_results_dict: SampleResultsDictType = {} - sample_metadata: SampleMetadataType = {} - multinomial_samples: MultinomialSamplesType = {} - greedy_samples: Optional[torch.Tensor] = None - - # Create output tensor for sampled token ids. - if include_gpu_probs_tensor: - sampled_token_ids_tensor = torch.full((logprobs.shape[0], 1), - VLLM_INVALID_TOKEN_ID, - dtype=torch.long, - device=logprobs.device) - else: - sampled_token_ids_tensor = None - - # Counterintiutively, having two loops here is actually faster. - # The first loop can run without waiting on GPU<->CPU sync. - for sampling_type in SamplingType: - sample_indices = categorized_sample_indices[sampling_type] - num_tokens = len(sample_indices) - if num_tokens == 0: - continue - - seq_group_id = categorized_seq_group_ids[sampling_type] - seq_groups = [sampling_metadata.seq_groups[i] for i in seq_group_id] - sample_metadata[sampling_type] = (seq_group_id, seq_groups) - long_sample_indices = sample_indices.long() - if sampling_type == SamplingType.GREEDY: - greedy_samples = torch.argmax(logprobs[long_sample_indices], - dim=-1) - - if sampled_token_ids_tensor is not None: - # Store sampled tokens in output tensor. - sampled_token_ids_tensor[ - long_sample_indices] = greedy_samples.unsqueeze(-1) - - if modify_greedy_probs: - # If required, modify the probabilities such that sampling from - # the modified distribution would always sample the argmax - # token id. - _modify_greedy_probs_inplace(logprobs, probs, - long_sample_indices, - greedy_samples) - - elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): - max_n_in_batch = 1 - for seq_group in seq_groups: - if seq_group.is_prompt: - sampling_params = seq_group.sampling_params - max_n_in_batch = max(max_n_in_batch, sampling_params.n) - seq_groups_arg = (None if sampling_type == SamplingType.RANDOM else - seq_groups) - - if flashinfer_top_k_top_p_sampling is not None: - logger.warning("FlashInfer 0.2.3+ does not support " - "per-request generators. Falling back to " - "PyTorch-native implementation.") - - multinomial_samples[sampling_type] = _multinomial( - probs[long_sample_indices], - max_n_in_batch, - seq_groups=seq_groups_arg) - - if sampled_token_ids_tensor is not None: - # Store sampled tokens in output tensor. - sampled_token_ids_tensor[long_sample_indices] = \ - multinomial_samples[sampling_type].to(torch.long) - - else: - raise ValueError(f"Unsupported sampling type: {sampling_type}") - - # Encapsulate arguments for computing Pythonized sampler - # results, whether deferred or otherwise. - maybe_deferred_args = SampleResultArgsType( - sampling_metadata=sampling_metadata, - sample_metadata=sample_metadata, - multinomial_samples=multinomial_samples, - greedy_samples=greedy_samples, - sample_results_dict=sample_results_dict) - - if not sampling_metadata.skip_sampler_cpu_output: - # GPU<->CPU sync happens here. 
- # This also converts the sampler output to a Python object. - # Return Pythonized sampler result & sampled token ids - return get_pythonized_sample_results( - maybe_deferred_args), sampled_token_ids_tensor - else: - # Defer sampler result Pythonization; return deferred - # Pythonization args & sampled token ids - return ( - maybe_deferred_args, - sampled_token_ids_tensor, - ) - - -def _sample( - probs: torch.Tensor, - logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, - sampling_tensors: SamplingTensors, - include_gpu_probs_tensor: bool, - modify_greedy_probs: bool, -) -> SampleReturnType: - """ - Args: - probs: (num_query_tokens_in_batch, num_vocab) - logprobs: (num_query_tokens_in_batch, num_vocab) - sampling_metadata: The metadata for a batch for sampling. - sampling_tensors: Tensors that include sampling related metadata. - - Returns: - (next_token_ids, parent_seq_ids) for each seq group in a batch. - If sampling is skipped, it returns ([], []) - sampled_token_ids_tensor: A tensor of sampled token ids. - """ - return _sample_with_torch( - probs, - logprobs, - sampling_metadata, - sampling_tensors, - include_gpu_probs_tensor=include_gpu_probs_tensor, - modify_greedy_probs=modify_greedy_probs, - ) - - -def _get_ranks(x: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: - """ - This function calculates the ranks of the chosen tokens in a logprob tensor. - - Args: - x (torch.Tensor): 2D logprob tensor of shape (N, M) - where N is the no. of tokens and M is the vocab dim. - indices (torch.Tensor): List of chosen token indices. - - Returns: - torch.Tensor: 1D tensor of shape (N,) where N is the no. of tokens. - Each element in the returned tensor represents the rank - of the chosen token in the input logprob tensor. - """ - vals = x[torch.arange(0, len(x), device=x.device, dtype=indices.dtype), - indices] - result = (x > vals[:, None]) - del vals - return result.sum(1).add_(1) - - -def get_logprobs( - logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, - sample_results: SampleResultType, -) -> tuple[list[Optional[PromptLogprobs]], list[SampleLogprobs]]: - """Return sample logprobs and prompt logprobs. - - The logic consists of 3 parts. - - Select indices to compute logprob from, ranks of token ids, and - the top k token ids from logprobs. - - Compute prompt logprobs if required. - - Compute sample logprobs if required. - - Args: - logprobs: (num_query_tokens_across_batch, num_vocab). Each query token's - logprob per vocab. Sequence groups' query tokens are batched in a - single flattened tensor. For example, assuming there are N - seq groups, it is sorted by prefill tokens for seq_group_1 (if - prompt logprob is enabled), decode tokens for seq_group_1 (if - sampling is required), prefill tokens for seq_group_2, ... - sampling_metadata: The sampling metadata. - sample_results: (num_seq_groups) The tuple of (next_token_ids, - parent_ids) for each sequence group. When beam search is enabled, - sample_results can contain different number of seq_ids from - sampling_metadata.seq_groups. It is because beam search creates - 2 * BEAM_WIDTH number of samples (whereas there are only up to - BEAM_WIDTH number of seq_ids). - - Returns: - A tuple of prompt and sample logprobs per sequence group in a batch. - """ - # The index of query token to calculate logprobs. It includes both - # prompt and sample logprob indices. - query_indices: list[int] = [] - # The next token ids to get the logprob value from. 
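A quick, self-contained check of the rank definition used by `_get_ranks` above: a chosen token's rank is one plus the number of vocabulary entries with a strictly larger logprob. The tensors below are invented for illustration.

import torch

logprobs = torch.log_softmax(torch.tensor([[3.0, 1.0, 2.0],
                                           [0.5, 2.5, 1.5]]), dim=-1)
chosen = torch.tensor([2, 1])  # token 2 in row 0, token 1 in row 1

vals = logprobs[torch.arange(len(logprobs)), chosen]
ranks = (logprobs > vals[:, None]).sum(dim=1) + 1
print(ranks)  # tensor([2, 1]): token 2 is the 2nd most likely in row 0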
- next_token_ids: list[int] = [] - # The largest requested number of logprobs. We find logprobs as many as the - # largest num logprobs in this API. If every logprobs is None, it will be - # set to -1. - largest_num_logprobs = -1 - - # Select indices to compute logprob from, ranks of token ids, and the top - # k token ids from logprobs. - for (seq_group, sample_result) in zip(sampling_metadata.seq_groups, - sample_results): - sampling_params = seq_group.sampling_params - - # Update indices and tokens for prompt logprobs. - if (seq_group.is_prompt - and sampling_params.prompt_logprobs is not None): - largest_num_logprobs = max(largest_num_logprobs, - sampling_params.prompt_logprobs) - next_prompt_tokens = _get_next_prompt_tokens(seq_group) - query_indices.extend(seq_group.prompt_logprob_indices) - next_token_ids.extend(next_prompt_tokens) - - # Update indices and next tokenes for sample logprob. - if seq_group.do_sample: - token_ids, parent_seq_ids = sample_result - # NOTE: We cannot directly use sample_indices because - # sample_indices only contain parent seq_ids of a previous step. - # The current step may have different number of seq_ids, and - # we can obtain it from `sample_result[1]`. - query_idx = seq_group.sample_indices[0] - query_indices.extend( - [query_idx + parent_id for parent_id in parent_seq_ids]) - next_token_ids.extend(token_ids) - - if sampling_params.logprobs is not None: - largest_num_logprobs = max(largest_num_logprobs, - sampling_params.logprobs) - - assert len(next_token_ids) == len(query_indices) - - if len(query_indices) == 0: - empty_sampled_logprob: SampleLogprobs = [] - empty_prompt_logprob: Optional[PromptLogprobs] = None - num_seq_groups = len(sampling_metadata.seq_groups) - return [empty_prompt_logprob - ] * num_seq_groups, [empty_sampled_logprob] * num_seq_groups - - selected_logprobs, ranks = None, None - top_logprobs, top_token_ids = None, None - - # If largest_num_logprobs == -1, i.e. no logprobs are requested, we can - # skip the whole logprob calculation. - if largest_num_logprobs >= 0: - query_indices_gpu = torch.tensor(query_indices, device=logprobs.device) - next_token_ids_gpu = torch.tensor(next_token_ids, - device=logprobs.device) - - # (num_selected_query_tokens, num_logprobs). Note that query_indices can - # contain duplicates if beam search is enabled. - selected_logprobs = logprobs[[ - query_indices_gpu, - next_token_ids_gpu, - ]] - ranks = _get_ranks( - logprobs[query_indices_gpu], - next_token_ids_gpu, - ) - assert selected_logprobs.shape[0] == ranks.shape[0] - - # We need to compute top k only if there exists logprobs > 0. - if largest_num_logprobs > 0: - # Logprobs of topk tokens for a batch of sequence groups. - # (num_query_tokens_across_batch). - top_logprobs, top_token_ids = torch.topk(logprobs, - largest_num_logprobs, - dim=-1) - top_logprobs = top_logprobs.to('cpu') - top_token_ids = top_token_ids.to('cpu') - - selected_logprobs = selected_logprobs.to('cpu') - ranks = ranks.to('cpu') - - # Find prompt/sample logprobs. 
- prompt_logprobs_per_seq_group: list[Optional[PromptLogprobs]] = [] - sample_logprobs_per_seq_group: list[SampleLogprobs] = [] - top_logprob_idx = 0 - selected_logprobs_idx = 0 - - for seq_group, sample_result in zip(sampling_metadata.seq_groups, - sample_results): - (prompt_logprobs, top_logprob_idx, - selected_logprobs_idx) = _get_prompt_logprob_if_needed( - seq_group, selected_logprobs, ranks, top_token_ids, top_logprobs, - selected_logprobs_idx, top_logprob_idx) - prompt_logprobs_per_seq_group.append(prompt_logprobs) - - (sampled_logprobs, top_logprob_idx, - selected_logprobs_idx) = _get_sampled_logprob_if_needed( - seq_group, sample_result, selected_logprobs, ranks, top_token_ids, - top_logprobs, selected_logprobs_idx, top_logprob_idx) - sample_logprobs_per_seq_group.append(sampled_logprobs) - - return prompt_logprobs_per_seq_group, sample_logprobs_per_seq_group - - -def _get_prompt_logprob_if_needed( - seq_group: SequenceGroupToSample, - selected_logprobs: torch.Tensor, - ranks: torch.Tensor, - top_token_ids: torch.Tensor, - top_logprobs: torch.Tensor, - selected_logprobs_idx: int, - top_logprob_idx: int, -): - """Compute the prompt logprob from a sequence group if needed.""" - sampling_params = seq_group.sampling_params - is_prompt = seq_group.is_prompt - - # Find prompt logprobs - prompt_logprobs: Optional[PromptLogprobs] = None - if is_prompt and sampling_params.prompt_logprobs is not None: - prompt_logprobs = [] - num_logprobs = sampling_params.prompt_logprobs - next_prompt_tokens = _get_next_prompt_tokens(seq_group) - # Pre-select indexes and create a list. It is faster than calling .item - # repetitively. - selected_logprob_items = selected_logprobs[ - selected_logprobs_idx:selected_logprobs_idx + - len(next_prompt_tokens)].tolist() - rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + - len(next_prompt_tokens)].tolist() - - for idx, token_id in enumerate(next_prompt_tokens): - # Calculate the prompt logprob of the real prompt tokens. - # {token_id: (logprob, rank_from_vocab)} - prompt_logprobs_dict: dict[int, tuple[float, int]] = { - token_id: (selected_logprob_items[idx], rank_items[idx]) - } - - # Add top K prompt logprobs along with its rank. - if num_logprobs > 0: - top_ids = top_token_ids[ - top_logprob_idx, :num_logprobs].tolist() - top_probs = top_logprobs[ - top_logprob_idx, :num_logprobs].tolist() - # Top K is already sorted by rank, so we can use 1 ~ - # num_logprobs + 1 for rank. - top_ranks = range(1, num_logprobs + 1) - prompt_logprobs_dict.update({ - top_id: (top_prob, rank) - for top_id, top_prob, rank in zip(top_ids, top_probs, - top_ranks) - }) - prompt_logprobs.append({ - token_id: Logprob(*logprob_and_rank) - for token_id, logprob_and_rank in prompt_logprobs_dict.items() - }) - # + 1 to go to the next prompt token. - top_logprob_idx += 1 - - # + len(next_prompt_tokens) to go to the next prompt. 
- selected_logprobs_idx += len(next_prompt_tokens) - return prompt_logprobs, top_logprob_idx, selected_logprobs_idx - - -def _get_sampled_logprob_if_needed( - seq_group: SequenceGroupToSample, - sample_result: tuple[list[int], list[int]], - selected_logprobs: torch.Tensor, - ranks: torch.Tensor, - top_token_ids: torch.Tensor, - top_logprobs: torch.Tensor, - selected_logprobs_idx: int, - top_logprob_idx: int, -): - """Compute the sample logprob if needed.""" - seq_ids = seq_group.seq_ids - num_logprobs = seq_group.sampling_params.logprobs - sampled_logprobs: SampleLogprobs = [] - next_token_ids, parent_seq_ids = sample_result - - if seq_group.do_sample: - assert len(next_token_ids) > 0 - if num_logprobs is None: - for next_token_id in next_token_ids: - # Use a dummy logprob - sampled_logprobs.append({next_token_id: Logprob(inf)}) - else: - # Pre-select items from tensor. tolist() is faster than repetitive - # `.item()` calls. - selected_logprob_items = selected_logprobs[ - selected_logprobs_idx:selected_logprobs_idx + - len(next_token_ids)].tolist() - rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + - len(next_token_ids)].tolist() - for idx, (next_token_id, parent_id) in enumerate( - zip(next_token_ids, parent_seq_ids)): - # Get the logprob of a sampled token. - sampled_logprobs_dict = { - next_token_id: - (selected_logprob_items[idx], rank_items[idx]) - } - if num_logprobs is not None and num_logprobs > 0: - # Get top K logprobs. - top_ids = top_token_ids[top_logprob_idx + - parent_id, :num_logprobs].tolist() - top_probs = top_logprobs[ - top_logprob_idx + parent_id, :num_logprobs].tolist() - # Top K is already sorted by rank, so we can use 1 ~ - # num_logprobs + 1 for rank. - top_ranks = range(1, num_logprobs + 1) - sampled_logprobs_dict.update({ - top_id: (top_prob, rank) - for top_id, top_prob, rank in zip( - top_ids, top_probs, top_ranks) - }) - - sampled_logprobs.append({ - token_id: Logprob(*logprob_and_rank) - for token_id, logprob_and_rank in - sampled_logprobs_dict.items() - }) - - # NOTE: This part of code is not intuitive. `selected_logprobs` include - # logprobs for the current step, which has len(next_token_ids) tokens - # per sequence group. `logprobs` includes logprobs from the previous - # steps, which has len(seq_ids) tokens per sequence group. - - # Iterate to the next sequence group in a batch. - selected_logprobs_idx += len(next_token_ids) - # Iterate to the next sequence group in a batch. - top_logprob_idx += len(seq_ids) - return sampled_logprobs, top_logprob_idx, selected_logprobs_idx - - -def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor, - sample_indices: torch.Tensor, - greedy_samples: torch.Tensor) -> None: - """Modify the probability distributions of the greedily-sampled tokens such - that each sampled token has a "probability" of 1.0. This is required by - speculative decoding, which depends on the sampling method being encoded - within the probability distribution for correctness. - - # Why do we only need to do this for greedy sampling? - - vLLM's sampler performs the following steps for greedy or multinomial - (random) sampling: - 1. Get logits from model. - 2. Modify logits according to per-sequence sampling parameters. - - Multiply by temperature, top-k and top-p masking, penalize tokens - according to their frequency, etc. - 3. Sample a token. - - Random sampling simply samples from the modified probability - distribution. - - Greedy sampling performs `argmax` to obtain the token with the - highest likelihood. 
- - Ignoring greedy sampling for a moment, we find that the computed probability - distribution has the following property: we can sample from it independently - and find that the token sampled by the Sampler has a frequency corresponding - to how often we see it in our sampling. In other words, for tokens sampled - with vLLM's random SamplingType, the computed probability distribution - encodes the sampling methodology completely. - - Greedy sampling does not normally have this property. vLLM modifies logits - according to sampling params, then performs `argmax`, then returns the - sampled token and the computed probability distribution. If we sample from - the distribution, we'll find the likelihood of the greedily-sampled token - is not always 1.0. - - Since lossless speculative decoding requires that the sampling methodology - be encoded within the probability distribution, we are motivated to modify - the probability distribution such that the sampled token has probability 1 - when speculative decoding is used. - - NOTE: Alternatively, we could use an extremely low temperature to achieve - greedy sampling using multinomial computation and unite the codepaths. This - has implications on the overall design of the sampler, e.g. how to record - accurate logprobs for the user, so this improvement is deferred to later. - """ - # NOTE: logprobs are not modified so they can be returned to the user. - probs[sample_indices, :] = 0 - probs[sample_indices, greedy_samples] = 1.0 - - -def _build_sampler_output( - maybe_deferred_sample_results: MaybeDeferredSampleResultType, - sampling_metadata: SamplingMetadata, - prompt_logprobs: Optional[list[Optional[PromptLogprobs]]], - sample_logprobs: Optional[list[SampleLogprobs]], - on_device_tensors: Optional[tuple[torch.Tensor, torch.Tensor, - torch.Tensor]], - skip_sampler_cpu_output: bool = False, -) -> SamplerOutput: - """Construct Python objects with the output of sampling. - - Args: - on_device_tensors: Tuple containing on-device tensors with the - probabilities used in sampling and the sampled token ids. This - allows post-processing without copies to CPU/serialization, e.g. in - speculative decoding rejection sampling. - """ - sampler_output: list[CompletionSequenceGroupOutput] = [] - - if skip_sampler_cpu_output: - assert isinstance(maybe_deferred_sample_results, SampleResultArgsType) - deferred_sample_results_args = maybe_deferred_sample_results - else: - assert prompt_logprobs is not None - assert sample_logprobs is not None - assert not isinstance(maybe_deferred_sample_results, - SampleResultArgsType) - assert len(sampling_metadata.seq_groups) \ - == len(maybe_deferred_sample_results) \ - == len(prompt_logprobs) \ - == len(sample_logprobs) - deferred_sample_results_args = None - - for (seq_group, sample_result, group_prompt_logprobs, - group_sample_logprobs) in zip(sampling_metadata.seq_groups, - maybe_deferred_sample_results, - prompt_logprobs, sample_logprobs): - seq_ids = seq_group.seq_ids - next_token_ids, parent_ids = sample_result - seq_outputs: list[SequenceOutput] = [] - for parent_id, next_token_id, logprobs in zip( - parent_ids, next_token_ids, group_sample_logprobs): - seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, - logprobs)) - sampler_output.append( - CompletionSequenceGroupOutput(seq_outputs, - group_prompt_logprobs)) - - # If not specified, store None values in SamplerOutput. 
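To make the behaviour of `_modify_greedy_probs_inplace` above concrete: the probability rows of greedily-sampled positions are overwritten with a one-hot distribution on the argmax token, so that the distribution handed to speculative-decoding rejection sampling encodes the greedy choice. A minimal sketch with invented tensors:

import torch

probs = torch.tensor([[0.6, 0.3, 0.1],
                      [0.2, 0.5, 0.3],
                      [0.1, 0.1, 0.8]])
greedy_rows = torch.tensor([0, 2])  # positions that were sampled greedily
greedy_tokens = probs[greedy_rows].argmax(dim=-1)

# Zero the greedy rows, then place all probability mass on the argmax token.
probs[greedy_rows, :] = 0
probs[greedy_rows, greedy_tokens] = 1.0
print(probs)  # rows 0 and 2 are now one-hot; row 1 is untouched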
- if on_device_tensors is not None: - (sampled_token_probs, logprobs_tensor, - sampled_token_ids) = on_device_tensors - else: - sampled_token_probs, logprobs_tensor, sampled_token_ids = (None, None, - None) - - return SamplerOutput( - outputs=sampler_output, - sampled_token_probs=sampled_token_probs, - sampled_token_ids=sampled_token_ids, - logprobs=logprobs_tensor, - deferred_sample_results_args=deferred_sample_results_args) - - -def _get_next_prompt_tokens( - seq_group: SequenceGroupToSample) -> tuple[int, ...]: - """Get a list of next prompt tokens to compute logprob from a - given sequence group. - - It is used to compute prompt logprob. Imagine you have logprob for each - query token. Query token needs to know the next prompt token id to compute - prompt logprob. This is a helper to obtain next prompt token ids. - - This API has to be used only when the caller knows seq_group is in prefill - stage. - - Returns: - A list of next prompt tokens to compute logprob. - """ - assert seq_group.is_prompt, ( - "Caller should ensure the sequence group is in a prefill stage.") - seq_ids = seq_group.seq_ids - query_len = seq_group.query_len - assert query_len is not None - # prompt has only 1 seq id. - assert len(seq_ids) == 1 - seq_data = seq_group.seq_data[seq_ids[0]] - computed_len = seq_data.get_num_computed_tokens() - prompt_tokens = seq_data.prompt_token_ids - # +1 because we are looking for a next prompt token. - next_token_index_start = computed_len + 1 - next_token_index_end = min(computed_len + query_len + 1, - len(prompt_tokens)) - next_prompt_tokens = prompt_tokens[ - next_token_index_start:next_token_index_end] - return next_prompt_tokens diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py deleted file mode 100644 index e6f1ca61dd29..000000000000 --- a/vllm/model_executor/pooling_metadata.py +++ /dev/null @@ -1,79 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from dataclasses import dataclass -from typing import Any - -import torch - -from vllm.pooling_params import PoolingParams -from vllm.utils import is_pin_memory_available - - -class PoolingMetadata: - """Metadata for pooling operations in the Pooler layer. - - This class holds the necessary information for pooling operations, - providing context for how to perform pooling and other related operations. - - Attributes: - seq_groups: List of (seq_ids, pooling_params). - seq_data: A mapping of sequence ID to additional sequence data. - prompt_lens: List of the lengths of each prompt. - """ - - def __init__( - self, - seq_groups: list[tuple[list[int], PoolingParams]], - seq_data: dict[int, Any], # Specific data related to sequences - prompt_lens: list[int], - ) -> None: - self.seq_groups = seq_groups - self.seq_data = seq_data - self.prompt_lens = prompt_lens - - def __repr__(self) -> str: - return ("PoolingMetadata(" - f"seq_groups={self.seq_groups}, " - f"seq_data={self.seq_data}, " - f"prompt_lens={self.prompt_lens})") - - def __getitem__(self, indices: slice): - return PoolingMetadata( - seq_groups=self.seq_groups[indices], - seq_data=dict(list(self.seq_data.items())[indices]), - prompt_lens=self.prompt_lens[indices], - ) - - -@dataclass -class PoolingTensors: - """Tensors for pooling.""" - - prompt_lens: torch.Tensor - - @classmethod - def from_pooling_metadata( - cls, - pooling_metadata: "PoolingMetadata", - device: torch.device, - ) -> "PoolingTensors": - """ - Create PoolingTensors from PoolingMetadata. 
- - Args: - pooling_metadata: PoolingMetadata instance to convert. - device: Device to store the tensors. - """ - # Convert prompt lengths to tensor - pin_memory = is_pin_memory_available() - - prompt_lens_t = torch.tensor( - pooling_metadata.prompt_lens, - device="cpu", - dtype=torch.long, - pin_memory=pin_memory, - ) - - return cls(prompt_lens=prompt_lens_t.to(device=device, - non_blocking=True), ) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py deleted file mode 100644 index 56f0f0984bfa..000000000000 --- a/vllm/model_executor/sampling_metadata.py +++ /dev/null @@ -1,597 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from array import array -from dataclasses import dataclass -from typing import Optional - -import torch - -from vllm.sampling_params import SamplingParams, SamplingType -from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData, - SequenceGroupMetadata) -from vllm.utils import (PyObjectCache, async_tensor_h2d, - is_pin_memory_available, make_tensor_with_pad) - -_SAMPLING_EPS = 1e-5 - - -@dataclass -class SequenceGroupToSample: - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| - # |-- query_len ---| - - # Sequence ids for the sequence group in a previous step. - seq_ids: list[int] - sampling_params: SamplingParams - # seq_id -> sequence data. - seq_data: dict[int, SequenceData] - # The length of the sequence (all tokens seen in the past + new token to - # compute attention) of the sequence group. None if it is in a decode - # stage. - seq_len: Optional[int] - # The length of new query tokens to compute in the current step. None if it - # is in a decode stage. The length of query_len <= seq_len if chunked - # prefill is enabled. - query_len: Optional[int] - # A random number generator for sampling. - generator: Optional[torch.Generator] - # True if the sequence group is in prefill stage. False if it is in a - # decode stage. - is_prompt: bool - # Query token indices from logits. to compute prompt logprob. Empty if - # prompt logprob is not required. - prompt_logprob_indices: list[int] - # Sample token indices from logits. Empty if sampling is not required. 
- sample_indices: list[int] - - @property - def do_sample(self): - return len(self.sample_indices) > 0 - - def __post_init__(self): - if len(self.prompt_logprob_indices) > 0: - assert self.sampling_params.prompt_logprobs is not None - if self.is_prompt: - assert self.seq_len is not None - assert self.query_len is not None - - -def gen_seq_group_to_sample_builder(num_seqs: int): - return lambda: SequenceGroupToSample( - seq_ids=[0] * num_seqs, - sampling_params=None, - seq_data=None, # type: ignore - seq_len=0, - query_len=0, - generator=None, - is_prompt=True, - prompt_logprob_indices=[], - sample_indices=[], - ) - - -class SamplingMetadataCache: - """Used to cache SamplingMetadata objects between scheduler iterations""" - - def __init__(self): - self._seq_group_to_sample_cache: dict[int, PyObjectCache] = {} - - def get_cached_seq_group_to_sample(self, num_seqs): - if num_seqs not in self._seq_group_to_sample_cache: - self._seq_group_to_sample_cache[num_seqs] = PyObjectCache( - gen_seq_group_to_sample_builder(num_seqs)) - - obj = self._seq_group_to_sample_cache[num_seqs].get_object() - return obj - - def reset(self): - for cache in self._seq_group_to_sample_cache.values(): - cache.reset() - - -class SamplingMetadata: - """Metadata for input sequences. Used in sampler. - - The usage is as follow; - ``` - hidden_states = execute_model(...) - logits = hidden_states[sampling_metadata.selected_token_indices] - sample(logits) - - def sample(logits): - # Use categorized_sample_indices for sampling.... - ``` - - Args: - seq_groups: List of batched sequence groups. - selected_token_indices: (num_query_tokens_to_logprob). Indices to find - logits from the initial model output hidden states. - categorized_sample_indices: SamplingType -> token indices to sample. - Each token indices is 2D tensor of (num_indices, num_indices) where - the first item means the sample index within the returned logit - (before pruning padding), and the second item means the sample - index after pruning using selected_token_indices. - For example, if the returned logit is [1, 2, 3], and we select - [1, 2] for sampling, the pruned logit will be [2, 3]. In this case, - The first tuple is [1, 2] (sampled index within original logit), - and the second tuple is [0, 1] (sampled index within pruned logit). - num_prompts: Number of prompt sequence groups in seq_groups. - skip_sampler_cpu_output: Indicates if we want to skip the GPU=>CPU - serialization of token outputs. - reuse_sampling_tensors: Indicates if we want to reuse sampling - tensors that are part of the sampler forward pass. Currently, - it is mainly used for multi-step decode. 
- - """ - - def __init__( - self, - seq_groups: list[SequenceGroupToSample], - selected_token_indices: torch.Tensor, - categorized_sample_indices: dict[SamplingType, torch.Tensor], - num_prompts: int, - skip_sampler_cpu_output: bool = False, - reuse_sampling_tensors: bool = False, - ) -> None: - self.seq_groups = seq_groups - self.selected_token_indices = selected_token_indices - self.categorized_sample_indices = categorized_sample_indices - self.num_prompts = num_prompts - self.skip_sampler_cpu_output = skip_sampler_cpu_output - self.reuse_sampling_tensors = reuse_sampling_tensors - - @staticmethod - def prepare( - seq_group_metadata_list: list[SequenceGroupMetadata], - seq_lens: list[int], - query_lens: list[int], - device: str, - pin_memory: bool, - generators: Optional[dict[str, torch.Generator]] = None, - cache: Optional[SamplingMetadataCache] = None, - ) -> "SamplingMetadata": - ( - seq_groups, - selected_token_indices, - categorized_sample_indices, - num_prompts, - ) = _prepare_seq_groups(seq_group_metadata_list, seq_lens, query_lens, - device, generators, cache) - selected_token_indices = async_tensor_h2d( - selected_token_indices, - dtype=torch.long, - target_device=device, - pin_memory=pin_memory, - ) - categorized_sample_indices = { - t: - async_tensor_h2d( - seq_ids, - dtype=torch.int, - target_device=device, - pin_memory=pin_memory, - ) - for t, seq_ids in categorized_sample_indices.items() - } - - sampling_metadata = SamplingMetadata( - seq_groups=seq_groups, - selected_token_indices=selected_token_indices, - categorized_sample_indices=categorized_sample_indices, - num_prompts=num_prompts, - ) - return sampling_metadata - - def __repr__(self) -> str: - return ( - "SamplingMetadata(" - f"seq_groups={self.seq_groups}, " - f"selected_token_indices={self.selected_token_indices}, " - f"categorized_sample_indices={self.categorized_sample_indices})") - - -def _prepare_seq_groups( - seq_group_metadata_list: list[SequenceGroupMetadata], - seq_lens: list[int], - query_lens: list[int], - device: str, - generators: Optional[dict[str, torch.Generator]] = None, - cache: Optional[SamplingMetadataCache] = None, -) -> tuple[ - list[SequenceGroupToSample], - list[int], - dict[SamplingType, list[int]], - int, -]: - """Prepare sequence groups and indices for sampling. - - Args: - seq_group_metadata_list: A list of sequence group to batch. - seq_lens: A list of sequence lens per sequence group. - Index of prompt len should match with seq_group_metadata_list. - query_lens: A list of query lengths. Prompt lens include the length - of entire prompt tokens, and it could be shorter. - device: A device to use for random number generators, - `SequenceGroupToSample.generator`. - generators: A store of per-request random number generators used - for seeded requests. - - Returns: - seq_groups: A list of sequence group to sample. - selected_token_indices: See the definition from `SamplingMetadata`. - categorized_sample_indices: See the definition from `SamplingMetadata`. - num_prompts: Total number of prompts from `seq_group_metadata_list`. - """ - # Batched sequence groups for the current model forward stsep. - seq_groups: list[SequenceGroupToSample] = [] - # A list of token indices to sample/compute logprob. It is used to - # prune the outcome logits from the model for the performance. - selected_token_indices: list[int] = [] - # Used for selected_token_indices. 
- model_output_idx = 0 - - # Sampling type -> ( - # indices to sample/prompt logprob within pruned output logits, - # indices to sample within pruned logits) - categorized_sample_indices: dict[SamplingType, list[int]] = { - t: [] - for t in SamplingType - } - # Index of logits to compute logprob. Logits include both prompt logprob - # and sample logprob indices. - logit_idx = 0 - # Total number of prompts from given sequence groups. - num_prompts = 0 - - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - seq_ids = seq_group_metadata.seq_data.keys() - - if cache is not None: - sample_obj = cache.get_cached_seq_group_to_sample(len(seq_ids)) - - for j, seq_id in enumerate(seq_ids): - sample_obj.seq_ids[j] = seq_id - - sample_obj.prompt_logprob_indices.clear() - sample_obj.sample_indices.clear() - - sampling_params = seq_group_metadata.sampling_params - is_prompt = seq_group_metadata.is_prompt - generator: Optional[torch.Generator] = None - # If the current seq group is in decode stage, it is None. - seq_len: Optional[int] = None - query_len: Optional[int] = None - prompt_logprob_indices: list[int] = (sample_obj.prompt_logprob_indices - if cache is not None else []) - sample_indices: list[int] = (sample_obj.sample_indices - if cache is not None else []) - do_sample = seq_group_metadata.do_sample - - if seq_group_metadata.is_prompt: - if sampling_params.seed is not None: - generator = torch.Generator(device=device).manual_seed( - sampling_params.seed) - if generators is not None: - generators[seq_group_metadata.request_id] = generator - - num_prompts += 1 - num_prefill_sample = len(seq_ids) - assert num_prefill_sample == 1 - assert query_lens is not None and seq_lens is not None - query_len, seq_len = query_lens[i], seq_lens[i] - # If we need sampling, exclude num_prefill_sample tokens from - # prompt logprob. - prompt_logprob_len = (query_len - num_prefill_sample - if do_sample else query_len) - sample_len = num_prefill_sample if do_sample else 0 - else: - # Decode - prompt_logprob_len = 0 - query_len = query_lens[i] if query_lens is not None and len( - query_lens) > 0 else 1 - sample_len = len(seq_ids) * query_len if do_sample else 0 - - if sampling_params.seed is not None and generators is not None: - generator = generators.get(seq_group_metadata.request_id) - - # Update indices to select from the model output. - """ - This blocks computes selected_token_indices which is used in the - following way. - - hidden_states = model(...) - logits = hidden_states[selected_token_indices] - """ - - if sampling_params.prompt_logprobs is not None: - selected_token_indices.extend( - range(model_output_idx, model_output_idx + prompt_logprob_len)) - model_output_idx += prompt_logprob_len - if do_sample: - selected_token_indices.extend( - range(model_output_idx, model_output_idx + sample_len)) - model_output_idx += sample_len - - # We now find indices for logprob computation and sampling. - """ - This block computes categorized_sample_indices which is used in the - following way. - - hidden_states = model(...) - logits = hidden_states[selected_token_indices] - def sample(logits): - # Use categorized_sample_indices for sampling. - # prompt_logprob_indices to find prompt logprob indices. - # sample_indices to find sample indices. 
- """ - - if sampling_params.prompt_logprobs is not None: - prompt_logprob_indices.extend( - range(logit_idx, logit_idx + prompt_logprob_len)) - logit_idx += prompt_logprob_len - if do_sample: - sample_indices.extend(range(logit_idx, logit_idx + sample_len)) - categorized_sample_indices[sampling_params.sampling_type].extend( - list(range(logit_idx, logit_idx + sample_len))) - logit_idx += sample_len - - if cache is not None: - sample_obj.sampling_params = sampling_params - sample_obj.seq_data = seq_group_metadata.seq_data - sample_obj.seq_len = seq_len - sample_obj.query_len = query_len - sample_obj.generator = generator - sample_obj.is_prompt = is_prompt - else: - sample_obj = SequenceGroupToSample( - seq_ids=list(seq_ids), - sampling_params=sampling_params, - seq_data=seq_group_metadata.seq_data, - seq_len=seq_len, - query_len=query_len, - generator=generator, - is_prompt=is_prompt, - prompt_logprob_indices=list(prompt_logprob_indices), - sample_indices=list(sample_indices), - ) - - seq_groups.append(sample_obj) - - if cache is not None: - cache.reset() - - return (seq_groups, selected_token_indices, categorized_sample_indices, - num_prompts) - - -@dataclass -class SamplingTensors: - """Tensors for sampling.""" - - temperatures: torch.Tensor - top_ps: torch.Tensor - top_ks: torch.Tensor - min_ps: torch.Tensor - presence_penalties: torch.Tensor - frequency_penalties: torch.Tensor - repetition_penalties: torch.Tensor - prompt_tokens: torch.Tensor - output_tokens: torch.Tensor - - @classmethod - def from_sampling_metadata( - cls, - sampling_metadata: "SamplingMetadata", - vocab_size: int, - device: torch.device, - dtype: torch.dtype, - ) -> tuple["SamplingTensors", bool, bool, bool]: - prompt_tokens: list[array] = [] - output_tokens: list[array] = [] - top_ks: list[int] = [] - temperatures: list[float] = [] - top_ps: list[float] = [] - min_ps: list[float] = [] - presence_penalties: list[float] = [] - frequency_penalties: list[float] = [] - repetition_penalties: list[float] = [] - do_penalties = False - do_top_p_top_k = False - do_min_p = False - - assert sampling_metadata.seq_groups is not None - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - temperature = sampling_params.temperature - p = sampling_params.presence_penalty - f = sampling_params.frequency_penalty - r = sampling_params.repetition_penalty - top_p = sampling_params.top_p - min_p = sampling_params.min_p - - # k should not be greater than the vocab size. - top_k = min(sampling_params.top_k, vocab_size) - top_k = vocab_size if top_k < 1 else top_k - if temperature < _SAMPLING_EPS: - # NOTE: Zero temperature means deterministic sampling - # (i.e., greedy sampling or beam search). - # Set the temperature to 1 to avoid division by zero. 
- temperature = 1.0 - if not do_top_p_top_k and (top_p < 1.0 - _SAMPLING_EPS - or top_k != vocab_size): - do_top_p_top_k = True - if not do_min_p and min_p > _SAMPLING_EPS: - do_min_p = True - if not do_penalties and (abs(p) >= _SAMPLING_EPS - or abs(f) >= _SAMPLING_EPS - or abs(r - 1.0) >= _SAMPLING_EPS): - do_penalties = True - - is_prompt = seq_group.is_prompt - if is_prompt and sampling_params.prompt_logprobs is not None: - # For tokens in the prompt that we only need to get - # their logprobs - query_len = seq_group.query_len - assert query_len is not None - prefill_len = len(seq_group.prompt_logprob_indices) - temperatures += [temperature] * prefill_len - top_ps += [top_p] * prefill_len - top_ks += [top_k] * prefill_len - min_ps += [min_p] * prefill_len - presence_penalties += [0] * prefill_len - frequency_penalties += [0] * prefill_len - repetition_penalties += [1] * prefill_len - - if seq_group.do_sample: - sample_lens = len(seq_group.sample_indices) - assert sample_lens >= len(seq_ids) - temperatures += [temperature] * sample_lens - top_ps += [top_p] * sample_lens - top_ks += [top_k] * sample_lens - min_ps += [min_p] * sample_lens - presence_penalties += [p] * sample_lens - frequency_penalties += [f] * sample_lens - repetition_penalties += [r] * sample_lens - - if do_penalties: - for seq_group in sampling_metadata.seq_groups: - seq_ids = seq_group.seq_ids - sampling_params = seq_group.sampling_params - if (seq_group.is_prompt - and sampling_params.prompt_logprobs is not None): - prefill_len = len(seq_group.prompt_logprob_indices) - prompt_tokens.extend( - array(VLLM_TOKEN_ID_ARRAY_TYPE) - for _ in range(prefill_len)) - output_tokens.extend( - array(VLLM_TOKEN_ID_ARRAY_TYPE) - for _ in range(prefill_len)) - if seq_group.do_sample: - for seq_id in seq_ids: - seq_data = seq_group.seq_data[seq_id] - prompt_tokens.append(seq_data.prompt_token_ids_array) - output_tokens.append(seq_data.output_token_ids_array) - - sampling_tensors = SamplingTensors.from_lists( - temperatures, - top_ps, - top_ks, - min_ps, - presence_penalties, - frequency_penalties, - repetition_penalties, - prompt_tokens, - output_tokens, - vocab_size, - device, - dtype, - ) - return (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p) - - @classmethod - def from_lists( - cls, - temperatures: list[float], - top_ps: list[float], - top_ks: list[int], - min_ps: list[float], - presence_penalties: list[float], - frequency_penalties: list[float], - repetition_penalties: list[float], - prompt_tokens: list[array], - output_tokens: list[array], - vocab_size: int, - device: torch.device, - dtype: torch.dtype, - ) -> "SamplingTensors": - # Note that the performance will be very bad without - # pinned memory. 
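The comment above is why every tensor in `from_lists` below is first materialized on the CPU with pinned memory and only then moved to the device: a page-locked staging buffer is what makes the `non_blocking=True` host-to-device copy genuinely asynchronous. A minimal sketch of that pattern, guarded so it also runs on a CPU-only machine:

import torch

values = [0.7, 0.9, 1.0, 0.8]
use_cuda = torch.cuda.is_available()

# Stage on the CPU; pinning is only possible (and only helps) with CUDA.
staging = torch.tensor(values, device="cpu", dtype=torch.float32,
                       pin_memory=use_cuda)

# From pinned memory, the copy can overlap with other host-side work.
device = torch.device("cuda" if use_cuda else "cpu")
on_device = staging.to(device=device, non_blocking=True)
print(on_device)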
- pin_memory = is_pin_memory_available() - - do_penalties = prompt_tokens or output_tokens - - if do_penalties: - prompt_t = make_tensor_with_pad( - prompt_tokens, - vocab_size, - device="cpu", - dtype=torch.int64, - pin_memory=pin_memory, - ) - output_t = make_tensor_with_pad( - output_tokens, - vocab_size, - device="cpu", - dtype=torch.int64, - pin_memory=pin_memory, - ) - else: - empty_tensor = torch.empty(0, device=device, dtype=torch.long) - prompt_t = empty_tensor - output_t = empty_tensor - - temperatures_t = torch.tensor( - temperatures, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - top_ps_t = torch.tensor( - top_ps, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - min_ps_t = torch.tensor( - min_ps, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - presence_penalties_t = torch.tensor( - presence_penalties, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - frequency_penalties_t = torch.tensor( - frequency_penalties, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - repetition_penalties_t = torch.tensor( - repetition_penalties, - device="cpu", - dtype=dtype, - pin_memory=pin_memory, - ) - top_ks_t = torch.tensor( - top_ks, - device="cpu", - dtype=torch.int, - pin_memory=pin_memory, - ) - # Because the memory is pinned, we can do non-blocking - # transfer to device. - - return cls( - temperatures=temperatures_t.to(device=device, non_blocking=True), - top_ps=top_ps_t.to(device=device, non_blocking=True), - top_ks=top_ks_t.to(device=device, non_blocking=True), - min_ps=min_ps_t.to(device=device, non_blocking=True), - presence_penalties=presence_penalties_t.to(device=device, - non_blocking=True), - frequency_penalties=frequency_penalties_t.to(device=device, - non_blocking=True), - repetition_penalties=repetition_penalties_t.to(device=device, - non_blocking=True), - prompt_tokens=prompt_t.to(device=device, non_blocking=True), - output_tokens=output_t.to(device=device, non_blocking=True), - ) diff --git a/vllm/sequence.py b/vllm/sequence.py deleted file mode 100644 index fe87b52f9df1..000000000000 --- a/vllm/sequence.py +++ /dev/null @@ -1,1534 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Sequence and its related classes.""" -import copy -import enum -from abc import ABC, abstractmethod -from array import array -from collections import defaultdict -from collections.abc import Mapping -from collections.abc import Sequence as GenericSequence -from dataclasses import dataclass, field -from functools import reduce -from typing import Any, Callable, Optional, Union - -import msgspec -import torch - -from vllm.inputs import SingletonInputs -from vllm.lora.request import LoRARequest -from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import RequestOutputKind, SamplingParams - -VLLM_TOKEN_ID_ARRAY_TYPE = "l" - -VLLM_INVALID_TOKEN_ID = -1 - - -def array_full(token_id: int, count: int): - """[`array`][] equivalent of [numpy.full][].""" - return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count - - -# We use dataclass for now because it is used for -# openai server output, and msgspec is not serializable. -# TODO(sang): Fix it. -@dataclass -class Logprob: - """Infos for supporting OpenAI compatible logprobs and token ranks. 
- - Attributes: - logprob: The logprob of chosen token - rank: The vocab rank of chosen token (>=1) - decoded_token: The decoded chosen token index - """ - logprob: float - rank: Optional[int] = None - decoded_token: Optional[str] = None - - -# {token_id -> logprob} per each sequence group. None if the corresponding -# sequence group doesn't require prompt logprob. -PromptLogprobs = list[Optional[dict[int, Logprob]]] -# {token_id -> logprob} for each sequence group. -SampleLogprobs = list[dict[int, Logprob]] - - -class SequenceStatus(enum.IntEnum): - """Status of a sequence.""" - WAITING = 0 - RUNNING = 1 - SWAPPED = 2 - # Note: anything after SWAPPED (2) will be considered - # as a finished status. - FINISHED_STOPPED = 3 - FINISHED_LENGTH_CAPPED = 4 - FINISHED_ABORTED = 5 - FINISHED_IGNORED = 6 - - @staticmethod - def is_finished(status: "SequenceStatus") -> bool: - return status > SequenceStatus.SWAPPED - - @staticmethod - def get_finished_reason(status: "SequenceStatus") -> Union[str, None]: - if status == SequenceStatus.FINISHED_STOPPED: - finish_reason = "stop" - elif status == SequenceStatus.FINISHED_LENGTH_CAPPED: - finish_reason = "length" - elif status == SequenceStatus.FINISHED_ABORTED: - finish_reason = "abort" - elif status == SequenceStatus.FINISHED_IGNORED: - # The ignored sequences are the sequences whose prompt lengths - # are longer than the model's length cap. Therefore, the stop - # reason should also be "length" as in OpenAI API. - finish_reason = "length" - else: - finish_reason = None - return finish_reason - - -class SequenceStage(enum.Enum): - PREFILL = enum.auto() - DECODE = enum.auto() - - -@dataclass -class RequestMetrics: - """Metrics associated with a request. - - Attributes: - arrival_time: The time when the request arrived. - first_scheduled_time: The time when the request was first scheduled. - first_token_time: The time when the first token was generated. - time_in_queue: The time the request spent in the queue. - finished_time: The time when the request was finished. - scheduler_time: The time spent in the scheduler when this request was - being considered by the scheduler. - model_forward_time: The time spent in the model forward pass when this - request was in the batch. - model_execute_time: The time spent in the model execute function. This - will include model forward, block/sync across - workers, cpu-gpu sync time and sampling time. - """ - arrival_time: float - last_token_time: float - first_scheduled_time: Optional[float] - first_token_time: Optional[float] - time_in_queue: Optional[float] - finished_time: Optional[float] = None - scheduler_time: Optional[float] = None - model_forward_time: Optional[float] = None - model_execute_time: Optional[float] = None - - -class SequenceDataDelta( - msgspec.Struct, - array_like=True, # type: ignore[call-arg] - omit_defaults=True): # type: ignore[call-arg] - """Delta SequenceData to send to workers per step.""" - # A new token to be appended to existing SequenceData. - new_output_token_ids: list[int] - # Overwriting existing `cumulative_logprob` - new_cumulative_logprob: float - # Overwriting existing `num_computed_tokens`. - new_num_computed_tokens: int - # Overwriting existing `stage`. - new_stage: SequenceStage - - -class SequenceData(msgspec.Struct, - omit_defaults=True): # type: ignore[call-arg] - """Data associated with a sequence. - - Args: - prompt_token_ids: The token IDs of the prompt. - output_token_ids: The token IDs of the output. Set to an empty list if - None. 
- - Attributes: - prompt_token_ids: The token IDs of the prompt. - output_token_ids: The token IDs of the output. - cumulative_logprob: The cumulative log probability of the output. - """ - # NOTE: we cannot use Union[list, array] because msgspec cannot support - # union of 2 list types. - _prompt_token_ids: array - _output_token_ids: array = msgspec.field( - default_factory=lambda: array(VLLM_TOKEN_ID_ARRAY_TYPE, [])) - - _prompt_embeds: Optional[torch.Tensor] = None - _output_embeds: Optional[torch.Tensor] = None - - ### The below fields should not be passed as an argument ### - _cumulative_logprob: float = 0.0 - _prompt_token_ids_tuple: tuple[int, - ...] = msgspec.field(default_factory=tuple) - # The number of tokens that are computed (that run against the model). - _num_computed_tokens: int = 0 - # The number of tokens with prefix cache hit. - _num_cached_tokens: int = 0 - _stage: SequenceStage = SequenceStage.PREFILL - _cached_all_token_ids: list[int] = msgspec.field(default_factory=list) - _cached_all_token_embeds: Optional[torch.Tensor] = None - - # It is used to get delta input. It is reset when `get_delta_and_reset` - # is called. - _new_appended_tokens: list[int] = msgspec.field(default_factory=list) - - # It is used to compute mrope_position_ids. - _mrope_position_delta: Optional[int] = None - - @staticmethod - def from_prompt_token_counts( - *token_counts: tuple[int, int]) -> "SequenceData": - """ - Construct a [`SequenceData`][vllm.sequence.SequenceData] instance - by concatenating prompt token sequences. - - Each tuple represents one token sequence, expressed in the form - `(token_id, count)`. - """ - if len(token_counts) == 0: - return SequenceData.from_seqs([]) - - prompt_token_ids_arr = reduce( - array.__iadd__, - (array_full(token_id, count) for token_id, count in token_counts), - ) - - return SequenceData(prompt_token_ids_arr) - - @staticmethod - def from_seqs( - prompt_token_ids: GenericSequence[int], - output_token_ids: Optional[GenericSequence[int]] = None, - *, - prompt_embeds: Optional[torch.Tensor] = None, - ) -> "SequenceData": - """ - Construct a [`SequenceData`][vllm.sequence.SequenceData] instance - from prompt and output token sequences. - """ - prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, - prompt_token_ids) - - if output_token_ids is None: - return SequenceData(prompt_token_ids_arr, - _prompt_embeds=prompt_embeds) - - output_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE, - output_token_ids) - - return SequenceData(prompt_token_ids_arr, - _output_token_ids=output_token_ids_arr, - _prompt_embeds=prompt_embeds) - - def __post_init__(self) -> None: - assert self._prompt_token_ids.typecode == "l" - assert self._output_token_ids.typecode == "l" - self._prompt_token_ids_tuple: tuple[int, ...] 
= tuple( - self._prompt_token_ids) - self._update_cached_all_tokens() - if self._prompt_embeds is not None: - self._update_cached_all_token_embeds() - - def _update_cached_all_tokens(self): - assert isinstance(self._prompt_token_ids, array) - assert isinstance(self._output_token_ids, array) - self._cached_all_token_ids: list[int] = list(self._prompt_token_ids + - self._output_token_ids) - - def _update_cached_all_token_embeds(self): - assert isinstance(self._prompt_embeds, torch.Tensor) - self._cached_all_token_embeds: torch.Tensor = self._prompt_embeds - if self._output_embeds is not None: - self._cached_all_token_embeds = torch.cat( - (self._cached_all_token_embeds, self._output_embeds), dim=0) - - @property - def cumulative_logprob(self) -> float: - return self._cumulative_logprob - - @property - def prompt_token_ids(self) -> tuple[int, ...]: - return self._prompt_token_ids_tuple - - @prompt_token_ids.setter - def prompt_token_ids(self, new_prompt_token_ids) -> None: - raise NotImplementedError - - @property - def prompt_token_ids_array(self) -> array: - """Return the prompt token ids in array type. - - Note that the array is in "I" type, and it is not compatible - with torch.long (2 bytes vs 4 bytes). So beware of the usage. - """ - return self._prompt_token_ids - - @property - def output_token_ids(self) -> tuple[int, ...]: - return tuple(self._output_token_ids) - - @output_token_ids.setter - def output_token_ids(self, - new_output_token_ids: GenericSequence[int]) -> None: - self._output_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, - new_output_token_ids) - self._update_cached_all_tokens() - - @property - def output_embeds(self) -> Optional[torch.Tensor]: - return self._output_embeds - - @output_embeds.setter - def output_embeds(self, new_output_token_embeds: torch.Tensor) -> None: - self._output_token_embeds = new_output_token_embeds - self._update_cached_all_token_embeds() - - @property - def output_token_ids_array(self) -> array: - """Return the prompt token ids in array type. - - Note that the array is in "I" type, and it is not compatible - with torch.long (2 bytes vs 4 bytes). So beware of the usage. 
- """ - assert isinstance(self._output_token_ids, array) - return self._output_token_ids - - @property - def prompt_embeds(self) -> Optional[torch.Tensor]: - return self._prompt_embeds - - @prompt_embeds.setter - def prompt_embeds(self, prompt_embeds: torch.Tensor) -> None: - self._prompt_embeds = prompt_embeds - self._update_cached_all_token_embeds() - - @property - def mrope_position_delta(self) -> Optional[int]: - return self._mrope_position_delta - - @mrope_position_delta.setter - def mrope_position_delta(self, new_mrope_position_delta): - self._mrope_position_delta = new_mrope_position_delta - - def append_token_id(self, - token_id: int, - logprob: float, - token_embed: Optional[torch.Tensor] = None) -> None: - self._output_token_ids.append(token_id) - self._new_appended_tokens.append(token_id) - self._cached_all_token_ids.append(token_id) - self._cumulative_logprob += logprob - if token_embed is not None: - # Do not pass in with batch or sequence dimensions - assert token_embed.ndim == 1 - token_embed = token_embed.detach().cpu().unsqueeze(0) - if self._output_embeds is None: - self._output_embeds = token_embed - else: - self._output_embeds = torch.cat( - (self._output_embeds, token_embed), dim=0) - assert self._cached_all_token_embeds is not None - self._cached_all_token_embeds = torch.cat( - (self._cached_all_token_embeds, - token_embed.to(device=self._cached_all_token_embeds.device)), - dim=0) - - def get_len(self) -> int: - return len(self._output_token_ids) + len(self._prompt_token_ids) - - def get_prompt_len(self) -> int: - return len(self._prompt_token_ids) - - def get_output_len(self) -> int: - return len(self._output_token_ids) - - def get_token_ids(self) -> list[int]: - return self._cached_all_token_ids - - def get_token_embeddings(self) -> Optional[torch.Tensor]: - return self._cached_all_token_embeds - - def get_prefix_token_ids( - self, num_tokens: int - ) -> tuple[tuple[int, ...], Optional[tuple[int, ...]]]: - """Get prefix tokens, and make the return value hashable""" - prompt_length = self.get_prompt_len() - if num_tokens > prompt_length: - return (self._prompt_token_ids_tuple, - tuple(self._output_token_ids[:num_tokens - prompt_length])) - else: - return (self._prompt_token_ids_tuple[:num_tokens], None) - - def get_num_computed_tokens(self) -> int: - """Return the number of prefill tokens that are already computed.""" - return self._num_computed_tokens - - def update_num_computed_tokens(self, num_new_computed_tokens: int): - """Update number of tokens computed so far.""" - self._num_computed_tokens += num_new_computed_tokens - assert self._num_computed_tokens <= self.get_len(), ( - self._num_computed_tokens, self.get_len()) - # If all tokens are computed, it means it is in decoding phase. - if self.get_num_uncomputed_tokens() == 0: - self._stage = SequenceStage.DECODE - - def get_num_cached_tokens(self) -> int: - """Return the number of tokens with prefix cache hit.""" - return self._num_cached_tokens - - def update_num_cached_tokens(self, num_cached_tokens: int): - """Update the number of tokens with prefix cache hit.""" - self._num_cached_tokens = num_cached_tokens - - def reset_state_for_recompute(self) -> None: - """Reset the number of computed tokens from this sequence. It is - supposed to be called when a sequence needs to be started from - the beginning again (e.g., sequence is preempted). 
- """ - self._num_computed_tokens = 0 - self._stage = SequenceStage.PREFILL - self._new_appended_tokens = [] - - def get_num_uncomputed_tokens(self) -> int: - """Return the number of prefill tokens that are not computed.""" - # we use `get_len()` which includes prompt_len + output_len instead - # of prompt_len here. This is because during recompute we need to - # prefill for both prompt and output. - return self.get_len() - self.get_num_computed_tokens() - - def get_last_token_id(self) -> int: - if not self._output_token_ids: - return self._prompt_token_ids[-1] - return self._output_token_ids[-1] - - def get_prompt_token_ids(self) -> tuple[int, ...]: - return self.prompt_token_ids - - def get_output_token_ids(self) -> tuple[int, ...]: - return self.output_token_ids - - def get_delta_and_reset(self) -> SequenceDataDelta: - delta = SequenceDataDelta(self._new_appended_tokens, - self._cumulative_logprob, - self.get_num_computed_tokens(), self.stage) - # Reset delta state. - self._new_appended_tokens = [] - return delta - - def apply_delta(self, delta: SequenceDataDelta): - self._num_computed_tokens = delta.new_num_computed_tokens - self._cumulative_logprob = delta.new_cumulative_logprob - self._stage = delta.new_stage - self._output_token_ids.extend(delta.new_output_token_ids) - self._cached_all_token_ids.extend(delta.new_output_token_ids) - - @property - def stage(self) -> SequenceStage: - return self._stage - - def __repr__(self) -> str: - return (f"SequenceData(" - f"prompt_token_ids={self._prompt_token_ids}, " - f"prompt_embeds.shape=" - f"{getattr(self._prompt_embeds, 'shape', None)}, " - f"output_token_ids={self.output_token_ids}, " - f"cumulative_logprob={self.cumulative_logprob}, " - f"get_num_computed_tokens={self.get_num_computed_tokens()})") - - -class Sequence: - """Stores the data, status, and block information of a sequence. - - The sequence is constructed from the - [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] (for decoder-only) - or [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs] - (for encoder-decoder) instance passed in through the `inputs` - constructor argument. - - Args: - seq_id: The ID of the sequence. - inputs: The inputs of the sequence. - block_size: The block size of the sequence. Should be the same as the - block size used by the block manager and cache engine. - eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM. - lora_request: LoRA request. 
- """ - - def __init__( - self, - seq_id: int, - inputs: SingletonInputs, - block_size: int, - eos_token_id: Optional[int] = None, - lora_request: Optional[LoRARequest] = None, - ) -> None: - self.seq_id = seq_id - self.inputs = inputs - self.block_size = block_size - self.eos_token_id = eos_token_id - self.lora_request = lora_request - - self.data = SequenceData.from_seqs( - self.prompt_token_ids, - prompt_embeds=self.inputs["prompt_embeds"] - if self.inputs["type"] == "embeds" else None) - self.output_logprobs: SampleLogprobs = [] - self.output_text = "" - - self.status = SequenceStatus.WAITING - self.stop_reason: Union[int, str, None] = None - - # These are used to keep track of delta outputs - self._last_output_token_ids_offset: int = 0 - self._last_output_text_offset: int = 0 - - # Used for incremental detokenization - self.prefix_offset = 0 - self.read_offset = 0 - # Input + output tokens - self.tokens: Optional[list[str]] = None - - @property - def n_blocks(self) -> int: - return (self.get_len() + self.block_size - 1) // self.block_size - - @property - def prompt(self) -> Optional[str]: - if self.inputs["type"] == "embeds": - return None - return self.inputs.get("prompt") - - @property - def prompt_token_ids(self) -> list[int]: - if self.inputs["type"] == "embeds": - return [0] * len(self.inputs["prompt_embeds"]) - return self.inputs["prompt_token_ids"] - - @property - def token_type_ids(self) -> list[int]: - if self.inputs["type"] == "embeds": - return [] - return self.inputs.get("token_type_ids", []) - - @property - def multi_modal_data(self) -> MultiModalKwargs: - if self.inputs["type"] == "multimodal": - return self.inputs["mm_kwargs"] - - return MultiModalKwargs({}) - - @property - def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: - if self.inputs["type"] == "multimodal": - return self.inputs["mm_placeholders"] - - return {} - - @property - def lora_int_id(self) -> int: - return self.lora_request.lora_int_id if self.lora_request else 0 - - def get_output_text_to_return(self, buffer_length: int, - delta: bool) -> str: - """If delta is True, only new text since the last call to - this method is returned""" - - # We return the full output text if the sequence is finished. 
- truncate = buffer_length and not self.is_finished() - if not delta: - return self.output_text[:-buffer_length] if truncate else ( - self.output_text) - length = len(self.output_text) - if truncate: - length -= buffer_length - last_offset = self._last_output_text_offset - if last_offset < length: - self._last_output_text_offset = length - return self.output_text[last_offset:length] - return "" - - def get_output_token_ids_to_return( - self, delta: bool) -> Union[GenericSequence[int], int]: - """If delta is True, only new tokens since the last call to - this method are returned""" - if not delta: - return self.get_output_token_ids() - - output_len = self.get_output_len() - - # Get the number of new tokens - num_new_tokens = output_len - self._last_output_token_ids_offset - self._last_output_token_ids_offset = output_len - - # Return new tokens - if num_new_tokens == 1: - # Optimization for single decode token case - # (which is what we have most of the time) - return self.data._cached_all_token_ids[-1] - - if num_new_tokens == 0: - return [] - - return self.data._cached_all_token_ids[-num_new_tokens:] - - def hash_of_block(self, logical_idx: int) -> int: - # TODO This can produce incorrect hash when block size > prompt size - - # Compute the number of tokens in the sequence - # TODO: The current hashing function is O(L^2). We should optimize - # this in the future. - num_tokens = self.num_hashed_tokens_of_block(logical_idx) - hashed_tokens = self.data.get_prefix_token_ids(num_tokens) - return hash((hashed_tokens, self.lora_int_id)) - - def extra_hash(self) -> Optional[int]: - """ - This function computes an extra hash for a sequence, specifically - designed for prefix caching mode. The final sequence hash is determined - by applying token_ids from the sequence's blocks. - """ - if self.lora_int_id == 0: - return None - - # NOTE: If there are additional factors influencing the block aside from - # token_ids, include them as input parameters to the hash. - return hash(self.lora_int_id) - - def num_hashed_tokens_of_block(self, logical_idx: int): - return logical_idx * self.block_size + self.block_size - - def reset_state_for_recompute(self): - """Reset the sequence states for recomputation.""" - self.data.reset_state_for_recompute() - - def append_token_id(self, - token_id: int, - logprobs: dict[int, Logprob], - token_embed: Optional[torch.Tensor] = None) -> None: - assert token_id in logprobs - self.output_logprobs.append(logprobs) - self.data.append_token_id(token_id, logprobs[token_id].logprob, - token_embed) - - def get_len(self) -> int: - return self.data.get_len() - - def get_prompt_len(self) -> int: - return self.data.get_prompt_len() - - def get_output_len(self) -> int: - return self.data.get_output_len() - - def get_token_ids(self) -> list[int]: - return self.data.get_token_ids() - - def get_prompt_token_ids(self) -> tuple[int, ...]: - return self.data.get_prompt_token_ids() - - def get_last_token_id(self) -> int: - return self.data.get_last_token_id() - - def get_output_token_ids(self) -> tuple[int, ...]: - return self.data.get_output_token_ids() - - def get_cumulative_logprob(self) -> float: - return self.data.cumulative_logprob - - def is_finished(self) -> bool: - return SequenceStatus.is_finished(self.status) - - def fork(self, new_seq_id: int) -> "Sequence": - new_seq = copy.deepcopy(self) - new_seq.seq_id = new_seq_id - return new_seq - - def get_num_new_tokens(self) -> int: - """Get the number of new tokens to be computed. 
- - Returns: - The new number of tokens to be computed. I.e., 1 for decode, or - the remaining prompt size for prefill. - """ - if self.data.stage == SequenceStage.DECODE: - return 1 - return self.data.get_num_uncomputed_tokens() - - def get_num_computed_tokens(self) -> int: - return self.data.get_num_computed_tokens() - - def is_prefill(self) -> bool: - return self.data.stage == SequenceStage.PREFILL - - def __repr__(self) -> str: - return (f"Sequence(seq_id={self.seq_id}, " - f"status={self.status.name}, " - f"num_blocks={self.n_blocks})") - - -class SequenceGroupState(msgspec.Struct, - omit_defaults=True): # type: ignore[call-arg] - """Mutable state tied to a specific sequence group""" - - # for multi-step decoding - num_steps: int = 1 - current_step: int = 0 - - @property - def remaining_steps(self) -> int: - return self.num_steps - self.current_step - - -class SequenceGroup: - """A group of sequences that are generated from the same prompt. - - Args: - request_id: The ID of the request. - seqs: The list of sequences. - sampling_params: The sampling parameters used to generate the outputs. - arrival_time: The arrival time of the request. - lora_request: LoRA request. - pooling_params: The parameters used to generate the pooler - for a pooling model. - pooled_data: The extracted hidden states from a pooling model. - encoder_seq: Optional, the single encoder sequence. Should be None - unless you are working with an encoder/decoder model. - trace_headers: OpenTelemetry trace headers. - priority: User-defined priority of the request. - draft_size: The number of speculative tokens plus one from the target - model; equal to max number of tokens a step can generate - for single-draft speculative decoding but larger than - that for multi-draft SD (currently not supported). - """ - - def __init__(self, - request_id: str, - seqs: list[Sequence], - arrival_time: float, - sampling_params: Optional[SamplingParams] = None, - lora_request: Optional[LoRARequest] = None, - pooling_params: Optional[PoolingParams] = None, - pooled_data: Optional[torch.Tensor] = None, - encoder_seq: Optional[Sequence] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - draft_size: int = 1) -> None: - self.request_id = request_id - self.seqs = seqs - self.first_seq = seqs[0] - self.arrival_time = arrival_time - self.is_single_seq = len(seqs) == 1 - self.seqs_dict = {seq.seq_id: seq for seq in seqs} - - self.sampling_params = sampling_params - self.metrics = RequestMetrics(arrival_time=arrival_time, - last_token_time=arrival_time, - first_scheduled_time=None, - first_token_time=None, - time_in_queue=None) - self.last_token_latency = 0.0 - self.lora_request = lora_request - self.prompt_logprobs: Optional[PromptLogprobs] = None - self.state = SequenceGroupState() - self.pooling_params = pooling_params - self.pooled_data = pooled_data - self.encoder_seq = encoder_seq - self.trace_headers = trace_headers - self.priority = priority - - self.cached_request_output = None - - @property - def prompt(self) -> Optional[str]: - return self.first_seq.prompt - - @property - def prompt_token_ids(self) -> list[int]: - return self.first_seq.prompt_token_ids - - @property - def encoder_prompt(self) -> Optional[str]: - # There are either 0 or 1 encoder sequences - # If one is present, its prompt is distinct - # from the decoder's. 
- return (self.encoder_seq.prompt - if self.encoder_seq is not None else None) - - @property - def encoder_prompt_token_ids(self) -> Optional[list[int]]: - # There are either 0 or 1 encoder sequences - # If one is present, its prompt token ids are - # distinct from the decoder's. - return (self.encoder_seq.prompt_token_ids - if self.encoder_seq is not None else None) - - @property - def token_type_ids(self) -> Optional[list[int]]: - return self.first_seq.token_type_ids - - @property - def multi_modal_data(self) -> MultiModalKwargs: - if self.first_seq.multi_modal_data: - return self.first_seq.multi_modal_data - elif self.encoder_seq is not None: - return self.encoder_seq.multi_modal_data - return MultiModalKwargs({}) - - @property - def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: - if self.first_seq.multi_modal_data: - return self.first_seq.multi_modal_placeholders - elif self.encoder_seq is not None: - return self.encoder_seq.multi_modal_placeholders - return {} - - @property - def lora_int_id(self) -> int: - return self.lora_request.lora_int_id if self.lora_request else 0 - - def init_multi_step(self, num_steps: int) -> None: - self.state.num_steps = num_steps - self.state.current_step = 0 - - def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int, - num_scheduler_steps: int, - is_multi_step: bool, - enable_chunking: bool) -> None: - - if not is_multi_step: - self.init_multi_step(num_steps=num_scheduler_steps) - return - - # Multi-Step case - is_prefill = self.is_prefill() - - # The asserts below reflect the expectations of the current system. - if is_prefill and enable_chunking: - assert num_lookahead_slots == num_scheduler_steps - self.init_multi_step(num_steps=num_lookahead_slots) - else: - is_decode: bool = not is_prefill - # If it is a prefill, num_lookahead_slots must be 0 - assert num_lookahead_slots == 0 or is_decode - # If it is a decode, num_lookahead_slots + 1 must match - # the scheduler steps. - assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill - self.init_multi_step(num_steps=num_lookahead_slots + 1) - - def set_last_token_time(self, now: float) -> None: - """Sets the last token time for Request level timings.""" - # If still in prefill phase, assertion fails. - assert not self.is_prefill(), ( - "seq_group.set_last_token_time() should not be called " - "if the seq_group is in prefill phase.") - self.last_token_latency = now - self.metrics.last_token_time - self.metrics.last_token_time = now - - def get_last_token_latency(self) -> float: - """Returns the latency of the last token.""" - assert not self.is_prefill(), ( - "seq_group.get_last_token_latency() should not be called " - "if the seq_group is in prefill phase.") - return self.last_token_latency - - def maybe_set_first_token_time(self, time: float) -> None: - """Sets the first token time for Request level timings.""" - # Note: in a case where a sequence_group is swapped and - # recomputed, the time between iterations is counted - # in TPOT, rather than recalculating TTFT (since from the ) - # POV of the user, there is simply a long generation delay. 
- if (self.metrics.first_token_time is None - and self.first_seq.get_output_len() == 1): - self.metrics.first_token_time = time - - def maybe_set_first_scheduled_time(self, time: float) -> None: - """Sets the first scheduled time and time in queue for Request - level timings.""" - if self.metrics.first_scheduled_time is None: - self.metrics.first_scheduled_time = time - self.metrics.time_in_queue = time - self.metrics.arrival_time - - def set_finished_time(self, time: Optional[float]) -> None: - """Sets the finished time for Request level timings.""" - self.metrics.finished_time = time - - def get_max_num_running_seqs(self) -> int: - """The maximum number of sequences running in parallel in the remaining - lifetime of the request.""" - if self.is_single_seq: - return 0 if self.first_seq.is_finished() else 1 - return self.num_seqs() - self.num_finished_seqs() - - def get_seqs( - self, - status: Optional[SequenceStatus] = None, - ) -> list[Sequence]: - if status is None: - return self.seqs - - if self.is_single_seq: - return self.seqs if self.first_seq.status == status else [] - - return [seq for seq in self.seqs if seq.status == status] - - def is_encoder_decoder(self) -> bool: - return self.encoder_seq is not None - - def get_encoder_seq(self) -> Optional[Sequence]: - return self.encoder_seq - - def get_finished_seqs(self) -> list[Sequence]: - if self.is_single_seq: - return self.seqs if self.first_seq.is_finished() else [] - - return [seq for seq in self.seqs if seq.is_finished()] - - def update_num_computed_tokens(self, num_new_computed_tokens: int): - """Update number of tokens computed so far.""" - for seq in self.seqs: - if not seq.is_finished(): - seq.data.update_num_computed_tokens(num_new_computed_tokens) - - def get_num_uncomputed_tokens(self) -> int: - num_uncomputed_tokens = 0 - for seq in self.seqs: - if not seq.is_finished(): - num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens() - return num_uncomputed_tokens - - def num_seqs(self, status: Optional[SequenceStatus] = None) -> int: - # Optimization. We don't need to call get_seqs if we don't need to - # filter by states. - if status is None: - return len(self.seqs) - - if self.is_single_seq: - return 1 if self.seqs[0].status == status else 0 - - return len(self.get_seqs(status)) - - def num_finished_seqs(self) -> int: - if self.is_single_seq: - return 1 if self.seqs[0].is_finished() else 0 - return len(self.get_finished_seqs()) - - def is_finished(self) -> bool: - if self.is_single_seq: - return self.first_seq.is_finished() - return all(seq.is_finished() for seq in self.seqs) - - def is_prefill(self) -> bool: - return self.first_seq.is_prefill() - - def __repr__(self) -> str: - return (f"SequenceGroup(request_id={self.request_id}, " - f"sampling_params={self.sampling_params}, " - f"num_seqs={len(self.seqs)})") - - def uses_prompt_embeds(self) -> bool: - """Returns True if the sequence group uses input embeds.""" - return any(seq.data.prompt_embeds is not None for seq in self.seqs) - - -class SequenceGroupMetadataDelta( - msgspec.Struct, - tag=True, # type: ignore[call-arg] - array_like=True, # type: ignore[call-arg] - omit_defaults=True): # type: ignore[call-arg] - """Delta of SequenceGroupMetadata. - - After sending the first SequenceGroupMetadata, vLLM scheduler - only sends delta to reduce the data payload size. 
-    """
-    seq_data_delta: dict[int, SequenceDataDelta]
-    request_id: str
-    block_tables: dict[int, list[int]]
-    is_prompt: bool
-    do_sample: bool = True
-    token_chunk_size: Optional[int] = None
-    computed_block_nums: Optional[list[int]] = None
-    state: Optional[SequenceGroupState] = msgspec.field(
-        default_factory=lambda: SequenceGroupState())
-
-
-class SequenceGroupMetadata(
-        msgspec.Struct,
-        tag=True,  # type: ignore[call-arg]
-        array_like=True,  # type: ignore[call-arg]
-        omit_defaults=True):  # type: ignore[call-arg]
-    """Metadata for a sequence group. Used to create `AttentionMetadata`.
-
-    Args:
-        request_id: The ID of the request.
-        is_prompt: Whether the request is at prompt stage.
-        seq_data: The sequence data. (Seq id -> sequence data)
-        sampling_params: The sampling parameters used to generate the outputs.
-        block_tables: The block tables. (Seq id -> list of physical block
-            numbers)
-        do_sample: True if sampling is required. Sampling is not required
-            when, e.g., prefill is chunked and the current iteration only
-            computes query tokens for prefill.
-        token_chunk_size: The number of tokens to be processed (per sequence).
-            None if chunking is not required.
-        lora_request: LoRA request.
-        computed_block_nums: The block numbers that are already computed,
-            used in prefix caching.
-        state: Internal state tied to this sequence group.
-        multi_modal_data: Multi modal data.
-        mm_processor_kwargs: Multimodal input processor / mapper overrides.
-        encoder_seq_data: Optional sequence data for encoder prompt
-                          (SequenceGroup.encoder_seq). Should be None
-                          unless you are working with an encoder/decoder
-                          model.
-        cross_block_table: Optional cross-attention block table associated
-                           with the encoder prompt
-                           (SequenceGroup.encoder_seq). Should be None
-                           unless you are working with an encoder/decoder
-                           model.
-    """
-
-    request_id: str
-    is_prompt: bool
-    seq_data: dict[int, SequenceData]
-    sampling_params: Optional[SamplingParams]
-    block_tables: dict[int, list[int]]
-    do_sample: bool = True
-    pooling_params: Optional[PoolingParams] = None
-    lora_request: Optional[LoRARequest] = None
-    computed_block_nums: Optional[list[int]] = None
-    state: Optional[SequenceGroupState] = msgspec.field(
-        default_factory=lambda: SequenceGroupState())
-    token_type_ids: Optional[list[int]] = None
-    multi_modal_data: Optional[MultiModalKwargs] = None
-    multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
-    encoder_seq_data: Optional[SequenceData] = None
-    cross_block_table: Optional[list[int]] = None
-    token_chunk_size: Optional[int] = None
-
-    ### Stateful fields that are lazily defined. ###
-    # The number of speculative tokens adopted in this request.
-    # None means speculative decoding is not used.
-    # Zero means speculative decoding is disabled for some reason.
-    # TODO: We should maintain this state out of the sequence group.
-    num_speculative_tokens: Optional[int] = None
-
-    def __post_init__(self):
-        if self.seq_data is not None and self.token_chunk_size is None:
-            if self.is_prompt:
-                self.token_chunk_size = next(iter(
-                    self.seq_data.values())).get_len()
-            else:
-                self.token_chunk_size = 1
-
-    @property
-    def lora_int_id(self) -> int:
-        return self.lora_request.lora_int_id if self.lora_request else 0
-
-    # Multi-Step Chunked-Prefill property
-    @property
-    def is_single_step_prompt(self) -> bool:
-        # do_sample is true, only when the token_chunk_size matches the
-        # num_uncomputed_tokens of the sequence.
This indicates that - # the prompt will finish processing in a single `execute_model` - # step. - return self.is_prompt and self.do_sample - - def get_first_seq_id(self) -> int: - # This is an efficient way of fetching the seq_id when - # we know this SequenceGroup has only one sequence. - return next(iter(self.seq_data)) - - def apply_delta(self, - sequence_group_metadata_delta: SequenceGroupMetadataDelta): - for id, delta in sequence_group_metadata_delta.seq_data_delta.items(): - self.seq_data[id].apply_delta(delta) - assert self.request_id == sequence_group_metadata_delta.request_id - self.block_tables = sequence_group_metadata_delta.block_tables - self.token_chunk_size = sequence_group_metadata_delta.token_chunk_size - self.do_sample = sequence_group_metadata_delta.do_sample - self.is_prompt = sequence_group_metadata_delta.is_prompt - - def finish_step(self) -> None: - assert self.state is not None - assert self.state.current_step < self.state.num_steps, \ - f"current step {self.state.current_step}, num_steps {self.state.num_steps}" # noqa - self.state.current_step += 1 - - -class SequenceOutput( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] - """The model output associated with a sequence. - - Args: - parent_seq_id: The ID of the parent sequence (for forking in beam - search). - output_token: The output token ID. - logprobs: The logprobs of the output token. - (Token id -> logP(x_i+1 | x_0, ..., x_i)) - """ - parent_seq_id: int - output_token: int - logprobs: dict[int, Logprob] - output_embed: Optional[torch.Tensor] = None - - def __repr__(self) -> str: - output_embed_shape = \ - self.output_embed.shape if self.output_embed is not None else None - return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, " - f"output_token={self.output_token}, " - f"output_embed.shape={output_embed_shape}, " - f"logprobs={self.logprobs})") - - def __eq__(self, other: object) -> bool: - if not isinstance(other, SequenceOutput): - raise NotImplementedError() - equal = (self.parent_seq_id == other.parent_seq_id - and self.output_token == other.output_token) - log_probs_equal = other.logprobs == self.logprobs - return equal and log_probs_equal - - -class SequenceGroupOutput(ABC): - """The base class for model outputs associated with a sequence group.""" - - @abstractmethod - def __repr__(self) -> str: - pass - - @abstractmethod - def __eq__(self, other: object) -> bool: - pass - - -class CompletionSequenceGroupOutput( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] - """The model output associated with a completion sequence group.""" - __metaclass__ = SequenceGroupOutput - samples: list[SequenceOutput] - # Prompt logprob for each prompt query token. 
- prompt_logprobs: Optional[PromptLogprobs] - step_index: Optional[int] = 0 - - def __repr__(self) -> str: - return (f"CompletionSequenceGroupOutput(samples={self.samples}, " - f"prompt_logprobs={self.prompt_logprobs})") - - def __eq__(self, other: object) -> bool: - if not isinstance(other, CompletionSequenceGroupOutput): - raise NotImplementedError() - return (self.samples == other.samples - and self.prompt_logprobs == other.prompt_logprobs) - - -class PoolingSequenceGroupOutput( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True, # type: ignore[call-arg] -): - """The model output associated with a pooling sequence group.""" - __metaclass__ = SequenceGroupOutput - # Annotated as Any to be compatible with msgspec - # The actual type is in SequenceGroup.pooled_data - data: Any - - def get_data_nbytes(self) -> int: - data: torch.Tensor = self.data - return data.nbytes - - def __repr__(self) -> str: - return f"PoolingSequenceGroupOutput(data={self.data}" - - def __eq__(self, other: object) -> bool: - if not isinstance(other, PoolingSequenceGroupOutput): - raise NotImplementedError() - return self.data == other.data - - -# cannot use msgspec.Struct here because Dynamo does not support it -@dataclass -class IntermediateTensors: - """For all pipeline stages except the last, we need to return the hidden - states and residuals to be sent to the next stage. This data structure - contains the hidden states and residuals for a request. - - Each stage also needs to handle its own finished_sending and - finished_recving in case of kv transfer. - """ - - tensors: dict[str, torch.Tensor] - # [req_ids] - finished_sending: Optional[set[str]] = None - finished_recving: Optional[set[str]] = None - - def __init__(self, tensors): - # manually define this function, so that - # Dynamo knows `IntermediateTensors()` comes from this file. - # Otherwise, dataclass will generate this function by evaluating - # a string, and we will lose the information about the source file. - self.tensors = tensors - - def __getitem__(self, key: Union[str, slice]): - if isinstance(key, str): - return self.tensors[key] - elif isinstance(key, slice): - return self.__class__({k: v[key] for k, v in self.tensors.items()}) - - def __setitem__(self, key: str, value: torch.Tensor): - self.tensors[key] = value - - def items(self): - return self.tensors.items() - - def __len__(self): - return len(self.tensors) - - def __eq__(self, other: object): - return isinstance(other, self.__class__) and self - - def __repr__(self) -> str: - return f"IntermediateTensors(tensors={self.tensors})" - - -class PoolerOutput( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] - """The output from a pooling operation in the pooling model.""" - outputs: list[PoolingSequenceGroupOutput] - - def get_data_nbytes(self) -> int: - return sum(o.get_data_nbytes() for o in self.outputs) - - def __getitem__(self, idx: int) -> PoolingSequenceGroupOutput: - return self.outputs[idx] - - def __setitem__(self, idx: int, value: PoolingSequenceGroupOutput): - self.outputs[idx] = value - - def __len__(self): - return len(self.outputs) - - def __eq__(self, other: object): - return isinstance(other, - self.__class__) and self.outputs == other.outputs - - -def get_all_seq_ids( - seq_group_metadata_list: list[SequenceGroupMetadata]) -> list[int]: - """Given a list of SequenceGroupMetadata, create a list of all - sequence ids. 
-    """
-    return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data]
-
-
-def get_all_seq_ids_and_request_ids(
-    seq_group_metadata_list: list[SequenceGroupMetadata]
-) -> tuple[list[int], dict[str, set[int]]]:
-    """Given a list of SequenceGroupMetadata, create a list of all
-    sequence ids and a mapping from request id to its sequence ids.
-    """
-    seq_ids: list[int] = []
-    request_id_seq_ids_mapping: defaultdict[str, set[int]] = defaultdict(set)
-    for sg in seq_group_metadata_list:
-        for seq_id in sg.seq_data:
-            seq_ids.append(seq_id)
-            request_id_seq_ids_mapping[sg.request_id].add(seq_id)
-    return seq_ids, request_id_seq_ids_mapping
-
-
-class HiddenStates(msgspec.Struct, array_like=True,
-                   omit_defaults=True):  # type: ignore[call-arg]
-    """Hidden states corresponding to in-progress sequences.
-    Used in speculative decoding to pass hidden states from
-    the target model to the proposer model.
-
-    seq_ids are the sequence ids of each entry of the batch
-    dimension of the hidden_states tensor"""
-    # Scorer hidden states. For prefill step, it is used for hidden states of
-    # all tokens, whereas for decode step, it is used for last accepted tokens.
-    hidden_states: torch.Tensor
-    # The sequence group metadata list. Only needed for decode step.
-    seq_group_metadata_list: Optional[list[SequenceGroupMetadata]] = None
-    # Scorer hidden states of the 2nd last token proposed by the proposer (
-    # irrespective of whether it was accepted or not). Only used for cases when
-    # last proposed token is accepted (i.e., in case of bonus tokens). For the
-    # case of no bonus tokens, these are ignored.
-    second_last_token_hidden_states: Optional[torch.Tensor] = None
-
-    _seq_ids: list[int] = msgspec.field(default_factory=list)
-
-    def __post_init__(self):
-        if self.seq_group_metadata_list is not None:
-            assert len(self.seq_group_metadata_list) == len(self.hidden_states)
-            self._seq_ids = get_all_seq_ids(self.seq_group_metadata_list)
-
-    @property
-    def seq_ids(self) -> list[int]:
-        return self._seq_ids
-
-    def update(self,
-               hidden_states: torch.Tensor,
-               seq_group_metadata_list: list[SequenceGroupMetadata],
-               second_last_token_hidden_states: Optional[torch.Tensor] = None):
-        """Update hidden states from target model invocation. Only used for
-        decode steps"""
-        assert len(seq_group_metadata_list) == len(hidden_states)
-        self._seq_ids.extend(get_all_seq_ids(seq_group_metadata_list))
-        self.hidden_states = torch.cat([self.hidden_states, hidden_states])
-
-        if self.second_last_token_hidden_states is not None:
-            # Adding dummy hidden_states to this to maintain same shape
-            self.second_last_token_hidden_states = torch.cat([
-                self.second_last_token_hidden_states,
-                torch.zeros_like(hidden_states)
-                if second_last_token_hidden_states is None else
-                second_last_token_hidden_states
-            ])
-
-    def prune(self,
-              seq_group_metadata_list: list[SequenceGroupMetadata]) -> None:
-        """Prune to provided list of sequence ids. Only used for decode steps.
-        """
-        # Currently this prunes all seq_ids not present in
-        # seq_group_metadata_list which might cause problems where a sequence
-        # may be "paused" then "resumed" later. This should only prune sequences
-        # which are confirmed to be aborted.
-        seq_ids = get_all_seq_ids(seq_group_metadata_list)
-        # Only keep sequence IDs that exist in self._seq_ids
-        seq_ids = [seq_id for seq_id in seq_ids if seq_id in self._seq_ids]
-        if seq_ids != self._seq_ids:
-            # Batch contents changed - prune removed sequences.
- index = [self._seq_ids.index(seq_id) for seq_id in seq_ids] - self.hidden_states = self.hidden_states[index] - if self.second_last_token_hidden_states is not None: - self.second_last_token_hidden_states = self\ - .second_last_token_hidden_states[index] - self._seq_ids = seq_ids - - def expand_with_bonus_tokens( - self, seq_with_bonus_token_in_last_step: set) -> None: - """Expand hidden states for sequences with bonus tokens. This is in - alignment with `MultiStepWorker._expand_execute_model_request`.""" - if self.second_last_token_hidden_states is None \ - or not seq_with_bonus_token_in_last_step: - return - - index = [] - for seq_id in self._seq_ids: - i = self._seq_ids.index(seq_id) - if seq_id in seq_with_bonus_token_in_last_step: - index.append(i + len(self._seq_ids)) - index.append(i) - - self.hidden_states = torch.cat( - [self.hidden_states, self.second_last_token_hidden_states])[index] - - -class ExecuteModelRequest( - msgspec.Struct, - array_like=True, # type: ignore[call-arg] - omit_defaults=True): # type: ignore[call-arg] - """The model execution request, containing CPU metadata only. The LLM - engine should create an instance of this class for each request batch.""" - # The sequence group metadata list. - seq_group_metadata_list: list[Union[SequenceGroupMetadata, - SequenceGroupMetadataDelta]] - # Blocks to swap in. List of CPU -> GPU block number. - blocks_to_swap_in: list[tuple[int, - int]] = msgspec.field(default_factory=list) - # Blocks to swap out. List of GPU -> CPU block number. - blocks_to_swap_out: list[tuple[int, - int]] = msgspec.field(default_factory=list) - # Blocks to copy. Source to dest block. - blocks_to_copy: list[tuple[int, int]] = msgspec.field(default_factory=list) - # Virtual engine ID for pipeline parallel. - virtual_engine: int = 0 - # The number of slots for lookahead decoding. - num_lookahead_slots: int = 0 - # The number of requests in the running queue. - running_queue_size: int = 0 - # Optional hidden states from prior step. - previous_hidden_states: Optional[HiddenStates] = None - # The number of forward steps to run. - num_steps: int = 1 - # Finished request ids since last step. - finished_requests_ids: list[str] = msgspec.field(default_factory=list) - # The last sampled token ids for multi step decoding. 
- last_sampled_token_ids: Optional[torch.Tensor] = None - # Async callback - async_callback: Optional[Callable] = None - - @property - def is_first_multi_step(self) -> bool: - # TODO(will) make this be able to handle batches with variable number of - # steps - assert len(self.seq_group_metadata_list) > 0 - first_seq_group = self.seq_group_metadata_list[0] - assert first_seq_group.state is not None - return first_seq_group.state.current_step == 0 - - @property - def is_last_step(self) -> bool: - # TODO(will) make this be able to handle batches with variable number of - # steps - assert len(self.seq_group_metadata_list) > 0 - first_seq_group = self.seq_group_metadata_list[0] - assert first_seq_group.state is not None - return first_seq_group.state.remaining_steps == 1 - - @property - def current_step(self) -> int: - # TODO(will) make this be able to handle batches with variable number of - # steps - assert len(self.seq_group_metadata_list) > 0 - state = self.seq_group_metadata_list[0].state - assert state is not None - return state.current_step - - def clone( - self, seq_group_metadata_list: list[Union[SequenceGroupMetadata, - SequenceGroupMetadataDelta]] - ) -> "ExecuteModelRequest": - """Clone the request with a new sequence group metadata list.""" - return ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=self.blocks_to_swap_in.copy(), - blocks_to_swap_out=self.blocks_to_swap_out.copy(), - blocks_to_copy=self.blocks_to_copy.copy(), - virtual_engine=self.virtual_engine, - num_lookahead_slots=self.num_lookahead_slots, - running_queue_size=self.running_queue_size, - previous_hidden_states=self.previous_hidden_states, - num_steps=self.num_steps, - finished_requests_ids=self.finished_requests_ids, - last_sampled_token_ids=self.last_sampled_token_ids.clone() - if self.last_sampled_token_ids is not None else None, - async_callback=self.async_callback) - - -@dataclass -class SequenceGroupBase: - group_id: str # the original request id before splitting - - assembled_seq_group: Optional[SequenceGroup] = None - - # seq id to a unique index inside this group - seq_id_to_index: dict[str, int] = field(default_factory=dict) - - # seq ids to be finished - to_be_finished: dict[str, SequenceGroup] = field(default_factory=dict) - - # seq id to finished sequences - finished_reqs: dict[str, SequenceGroup] = field(default_factory=dict) - - streaming: bool = False - - output_produced: bool = False - - @staticmethod - def add_request(request_id: str, engine, params, *args, **kwargs): - """When we are ready to add a request with request_id and params - into the engine, we can split the request into multiple requests. - """ - raise NotImplementedError - - def finish_seq(self, seq: SequenceGroup): - """The sequence `seq` finishes, we should record the information. - """ - del self.to_be_finished[seq.request_id] - self.finished_reqs[seq.request_id] = seq - - def maybe_assemble_group( - self, seq_group: SequenceGroup) -> Optional[SequenceGroup]: - """Assemble the sequence group, for producing the final - output, or adding request in the engine again. 
- """ - raise NotImplementedError - - -class ParallelSampleSequenceGroup(SequenceGroupBase): - - @staticmethod - def add_request(request_id: str, engine, params, **kwargs): - original_params = params - group = ParallelSampleSequenceGroup(request_id) - seqs = [] - for i in range(original_params.n): - request_id_i = f"{request_id}_parallel_sample_{i}" - group.seq_id_to_index[request_id_i] = i - params = original_params.clone() - params.n = 1 - if params.seed is not None: - params.seed += i - seq_group = engine._add_processed_request( - request_id_i, - params=params, - **kwargs, - ) # type: ignore - assert seq_group is not None - engine.seq_id_to_seq_group[request_id_i] = group - group.to_be_finished[request_id_i] = seq_group - seqs.append(seq_group.seqs[0]) - - # for parallel sampling, the `assembled_seq_group` is always - # available, since we have all the sequences ready, and they - # will not change. - group.assembled_seq_group = SequenceGroup( - request_id=request_id, - seqs=seqs, - arrival_time=seq_group.arrival_time, - sampling_params=original_params, - lora_request=seq_group.lora_request, - pooling_params=seq_group.pooling_params, - pooled_data=seq_group.pooled_data, - encoder_seq=seq_group.encoder_seq, - trace_headers=seq_group.trace_headers, - priority=seq_group.priority, - ) - - group.streaming = params.output_kind == RequestOutputKind.DELTA - group.output_produced = False - - def maybe_assemble_group( - self, seq_group: SequenceGroup) -> Optional[SequenceGroup]: - - # in the streaming mode, we will return the assembled sequence - # for the first remaining sequence, and then return None for the - # rest of sequences - if self.streaming: - first_remaining_id = next(iter(self.to_be_finished)) - if seq_group.request_id == first_remaining_id: - return self.assembled_seq_group - return None - - # in the non-streaming mode, we will return the assembled sequence - # when the last sequences finishes, and then return None for the - # rest of the time - if (len(self.to_be_finished) == 1 - and seq_group.request_id in self.to_be_finished - and seq_group.is_finished()): - assert self.assembled_seq_group is not None - params = self.assembled_seq_group.sampling_params - assert isinstance(params, SamplingParams) - if not self.output_produced: - self.output_produced = True - if params._real_n is not None: - # Get the top-n sequences. - n = params._real_n or params.n - seqs = self.assembled_seq_group.seqs - sorting_key = lambda seq: seq.get_cumulative_logprob() - sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) - top_n_seqs = sorted_seqs[:n] - self.assembled_seq_group.seqs = top_n_seqs - return self.assembled_seq_group - if self.output_produced: - return None - return None diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 2f5504ea14b4..881ccabd6527 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -9,7 +9,6 @@ from tokenizers.decoders import DecodeStream from transformers import PreTrainedTokenizerFast -from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) @@ -126,7 +125,7 @@ def update(self, new_token_ids: list[int], # 2) Evaluate stop strings. 
         stop_string = None
         if self.stop:
-            stop = StopChecker.check_stop_strings(
+            stop = check_stop_strings(
                 output_text=self.output_text,
                 new_char_count=len(self.output_text) - offset_before,
                 stop=self.stop,
@@ -290,3 +289,42 @@ def decode_next(self, next_token_id: int) -> str:
         self.read_offset = read_offset
 
         return decoded_text
+
+
+def check_stop_strings(
+    output_text: str,
+    new_char_count: int,
+    stop: list[str],
+    include_in_output: bool,
+) -> Optional[tuple[str, int]]:
+    """Check if any stop strings are matched and truncate sequence
+    output text accordingly.
+
+    Returns tuple (stop_string, offset) if matched or else None.
+
+    Where stop_string is the matched stop string and offset is the
+    length to which output_text should be truncated, or -1 for no
+    truncation.
+    """
+    if not new_char_count or not stop:
+        return None
+
+    for stop_str in stop:
+        stop_string_len = len(stop_str)
+        # Avoid searching already-searched text.
+        stop_index = output_text.find(stop_str,
+                                      1 - new_char_count - stop_string_len)
+        if stop_index == -1:
+            continue
+
+        if include_in_output:
+            # Truncate to end of stop string.
+            stop_index += stop_string_len
+            if stop_index >= len(output_text):
+                # No truncation required.
+                return stop_str, -1
+
+        # Truncate the output text to either the beginning
+        # or end of the stop string.
+        return stop_str, stop_index
+    return None
diff --git a/vllm/worker/__init__.py b/vllm/worker/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
deleted file mode 100644
index 530907012f70..000000000000
--- a/vllm/worker/cache_engine.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""CacheEngine class for managing the KV cache."""
-from typing import List
-
-import torch
-
-from vllm.attention import get_attn_backend
-from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig
-from vllm.logger import init_logger
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType,
-                        get_dtype_size, is_pin_memory_available)
-
-logger = init_logger(__name__)
-
-
-class CacheEngine:
-    """Manages the KV cache.
-
-    This class is responsible for initializing and managing the GPU and CPU KV
-    caches. It also provides methods for performing KV cache operations, such
-    as swapping and copying.
-    """
-
-    def __init__(
-        self,
-        cache_config: CacheConfig,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        device_config: DeviceConfig,
-    ) -> None:
-        self.cache_config = cache_config
-        self.model_config = model_config
-        self.parallel_config = parallel_config
-        self.device_config = device_config
-
-        self.head_size = model_config.get_head_size()
-        # Models like Jamba, have mixed typed layers, E.g Mamba
-        self.num_attention_layers = model_config.get_num_layers_by_block_type(
-            parallel_config, LayerBlockType.attention)
-        self.num_kv_heads = model_config.get_num_kv_heads(parallel_config)
-
-        self.block_size = cache_config.block_size
-        self.num_gpu_blocks = cache_config.num_gpu_blocks
-        if self.num_gpu_blocks:
-            self.num_gpu_blocks //= parallel_config.pipeline_parallel_size
-        self.num_cpu_blocks = cache_config.num_cpu_blocks
-        if self.num_cpu_blocks:
-            self.num_cpu_blocks //= parallel_config.pipeline_parallel_size
-
-        if cache_config.cache_dtype == "auto":
-            self.dtype = model_config.dtype
-        else:
-            self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
-
-        # Get attention backend.
- self.attn_backend = get_attn_backend(self.head_size, - model_config.dtype, - cache_config.cache_dtype, - self.block_size, - model_config.is_attention_free, - use_mla=model_config.use_mla) - - # Initialize the cache. - self.gpu_cache = self._allocate_kv_cache( - self.num_gpu_blocks, self.device_config.device_type) - self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu") - - def _allocate_kv_cache( - self, - num_blocks: int, - device: str, - ) -> List[torch.Tensor]: - """Allocates KV cache on the specified device.""" - kv_cache_generic_shape = self.attn_backend.get_kv_cache_shape( - num_blocks, self.block_size, self.num_kv_heads, self.head_size) - pin_memory = is_pin_memory_available() if device == "cpu" else False - kv_cache: List[torch.Tensor] = [] - try: - kv_cache_stride_order = self.attn_backend.get_kv_cache_stride_order( - ) - except (AttributeError, NotImplementedError): - kv_cache_stride_order = tuple(range(len(kv_cache_generic_shape))) - - # The allocation respects the backend-defined stride order to ensure - # the semantic remains consistent for each backend. We first obtain the - # generic kv cache shape and then permute it according to the stride - # order which could result in a non-contiguous tensor. - kv_cache_allocation_shape = tuple(kv_cache_generic_shape[i] - for i in kv_cache_stride_order) - - for _ in range(self.num_attention_layers): - # null block in CpuGpuBlockAllocator requires at least that - # block to be zeroed-out. - # We zero-out everything for simplicity. - layer_kv_cache = torch.zeros( - kv_cache_allocation_shape, - dtype=self.dtype, - pin_memory=pin_memory, - device=device).permute(*kv_cache_stride_order) - - # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) for cases - # when entry_shape is higher than 1D - kv_cache.append(layer_kv_cache) - return kv_cache - - def swap_in(self, src_to_dst: torch.Tensor) -> None: - for i in range(self.num_attention_layers): - self.attn_backend.swap_blocks(self.cpu_cache[i], self.gpu_cache[i], - src_to_dst) - - def swap_out(self, src_to_dst: torch.Tensor) -> None: - for i in range(self.num_attention_layers): - self.attn_backend.swap_blocks(self.gpu_cache[i], self.cpu_cache[i], - src_to_dst) - - def copy(self, src_to_dsts: torch.Tensor) -> None: - self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts) - - @staticmethod - def get_cache_block_size( - cache_config: CacheConfig, - model_config: ModelConfig, - parallel_config: ParallelConfig, - ) -> int: - head_size = model_config.get_head_size() - num_heads = model_config.get_num_kv_heads(parallel_config) - num_attention_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention) - - if cache_config.cache_dtype == "auto": - dtype = model_config.dtype - else: - dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - - key_cache_entry = num_heads * head_size - - # For MLA there is no value cache, since the latent vector - # is joint keys and values. 
- value_cache_entry = key_cache_entry if not model_config.use_mla else 0 - total = num_attention_layers * cache_config.block_size * \ - (key_cache_entry + value_cache_entry) - - dtype_size = get_dtype_size(dtype) - return dtype_size * total diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py deleted file mode 100644 index cb5d5664ab5c..000000000000 --- a/vllm/worker/enc_dec_model_runner.py +++ /dev/null @@ -1,554 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import itertools -from typing import Any, Dict, List, Optional, Tuple, Type, cast - -import torch -import torch.distributed - -from vllm.attention.backends.abstract import (AttentionBackend, - AttentionMetadata) -from vllm.attention.backends.utils import PAD_SLOT_ID -from vllm.attention.selector import (get_env_variable_attn_backend, - get_global_forced_attn_backend) -from vllm.config import VllmConfig -from vllm.forward_context import set_forward_context -from vllm.inputs import INPUT_REGISTRY, InputRegistry -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, - MultiModalRegistry) -from vllm.platforms import _Backend -from vllm.sampling_params import SamplingParams -from vllm.sequence import (IntermediateTensors, PoolerOutput, - SequenceGroupMetadata) -from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad -from vllm.worker.model_runner import (GPUModelRunnerBase, - ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata) -from vllm.worker.model_runner_base import ( - _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict) -from vllm.worker.utils import assert_enc_dec_mr_supported_scenario - -logger = init_logger(__name__) -LORA_WARMUP_RANK = 8 - - -@dataclasses.dataclass(frozen=True) -class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata): - """ - Used by the EncoderDecoderModelRunner. 
- """ - encoder_input_tokens: Optional[torch.Tensor] = None - encoder_input_positions: Optional[torch.Tensor] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "inputs_embeds": self.inputs_embeds, - "input_positions": self.input_positions, - "encoder_input_tokens": self.encoder_input_tokens, - "encoder_input_positions": self.encoder_input_positions, - "virtual_engine": self.virtual_engine, - "request_ids_to_seq_ids": self.request_ids_to_seq_ids, - "finished_requests_ids": self.finished_requests_ids, - "multi_modal_kwargs": self.multi_modal_kwargs, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "EncoderDecoderModelInput": - return cast( - EncoderDecoderModelInput, - super().from_broadcasted_tensor_dict(tensor_dict, attn_backend)) - - -class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): - _model_input_cls: Type[EncoderDecoderModelInput] = ( - EncoderDecoderModelInput) - _builder_cls: Type[ModelInputForGPUBuilder] = (ModelInputForGPUBuilder) - - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - input_registry: InputRegistry = INPUT_REGISTRY, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - ): - ''' - EncoderDecoderModelRunner constructor. - - `lora_config` is unused (since these features are not yet supported - for encoder/decoder models) but these arguments are present here for - compatibility with the base-class constructor. - ''' - self._maybe_force_supported_attention_backend() - - super().__init__( - vllm_config=vllm_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker, - input_registry=input_registry, - mm_registry=mm_registry, - ) - - # Crash for unsupported encoder/scenarios - assert_enc_dec_mr_supported_scenario(self) - - def _maybe_force_supported_attention_backend(self): - ''' - Force vLLM to use the XFormers attention backend, - which is currently the only supported option. - ''' - - def raise_backend_err(): - # The user has specified an attention backend override - # which is invalid for encoder/decoder models - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_BACKEND) - - maybe_env_var_forced_backend = get_env_variable_attn_backend() - maybe_global_forced_backend = get_global_forced_attn_backend() - is_forced_by_global = maybe_global_forced_backend is not None - is_forced_by_env_var = maybe_env_var_forced_backend is not None - if is_forced_by_global: # noqa: SIM102 - # Backend override enforced by global variable takes - # precedence over vLLM backend environment variable. 
- if maybe_global_forced_backend not in\ - [_Backend.XFORMERS, _Backend.FLASH_ATTN]: - raise_backend_err() - elif is_forced_by_env_var: # noqa: SIM102 - # Backend override enforced by vLLM backend - # environment variable - if maybe_env_var_forced_backend not in\ - [_Backend.XFORMERS, _Backend.FLASH_ATTN]: - raise_backend_err() - - def _list_to_int32_tensor( - self, - _list: List[int], - ) -> torch.Tensor: - return torch.tensor(_list, dtype=torch.int32, device=self.device) - - def _list_to_long_tensor( - self, - _list: List[int], - ) -> torch.Tensor: - return torch.tensor(_list, dtype=torch.long, device=self.device) - - def _empty_int32_tensor(self) -> torch.Tensor: - return self._list_to_int32_tensor([]) - - def _empty_long_tensor(self) -> torch.Tensor: - return self._list_to_long_tensor([]) - - @torch.inference_mode() - def execute_model( - self, - model_input: EncoderDecoderModelInput, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[PoolerOutput]]: - if num_steps > 1: - raise ValueError("num_steps > 1 is not supported in " - "EncoderDecoderModelRunner") - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - if (model_input.attn_metadata is not None - and model_input.attn_metadata.prefill_metadata is None - and model_input.attn_metadata.decode_metadata.use_cuda_graph): - if model_input.inputs_embeds is None: - assert model_input.input_tokens is not None - graph_batch_size = model_input.input_tokens.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, False)]) - else: - graph_batch_size = model_input.inputs_embeds.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, True)]) - else: - model_executable = self.model - - seqlen_agnostic_kwargs = { - "finished_requests_ids": model_input.finished_requests_ids, - "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_inner_state else {} - - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - with set_forward_context(model_input.attn_metadata, self.vllm_config, - model_input.virtual_engine): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - inputs_embeds=model_input.inputs_embeds, - positions=model_input.input_positions, - encoder_input_ids=model_input.encoder_input_tokens, - encoder_positions=model_input.encoder_input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs( - multi_modal_kwargs, - device=self.device, - ), - **seqlen_agnostic_kwargs, - ) - - logits = self.model.compute_logits(hidden_or_intermediate_states, - model_input.sampling_metadata) - - if not self.is_driver_worker: - return [] - - if model_input.async_callback is not None: - model_input.async_callback() - - # Sample the next token. 
- output: SamplerOutput = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - - return [output] - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> EncoderDecoderModelInput: - return EncoderDecoderModelInput.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> EncoderDecoderModelInput: - """Prepare the model input based on a given sequence group, including - metadata for the sampling step. - - Since chunked prefill is not supported for encoder/decoder models, - `input_tokens` is assumed to be either entirely prefill tokens or - entirely decode tokens. - - """ - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - ( - attn_metadata, - encoder_input_tokens_tensor, - encoder_input_positions_tensor, - ) = (self._prepare_encoder_model_input_tensors(seq_group_metadata_list, - model_input)) - # Inject attn_metadata encoder/cross-attention fields & - # encoder input tokens/positions into model_input. - # Frozen dataclass fields cannot be modified, so use - # dataclasses.replace to construct a new model input - # instance. - model_input = dataclasses.replace( - model_input, - attn_metadata=attn_metadata, - encoder_input_tokens=encoder_input_tokens_tensor, - encoder_input_positions=encoder_input_positions_tensor, - ) - - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, - model_input.seq_lens, - model_input.query_lens, - self.device, - self.pin_memory, - generators=generators) - is_prompt = (seq_group_metadata_list[0].is_prompt - if seq_group_metadata_list else None) - return dataclasses.replace(model_input, - sampling_metadata=sampling_metadata, - is_prompt=is_prompt, - virtual_engine=virtual_engine) - - @torch.inference_mode() - def profile_run(self) -> None: - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - - # This represents the maximum number of different requests - # that will have unique loras, and therefore the max amount of - # memory consumption. Create dummy lora request copies from the - # lora request passed in, which contains a lora from the lora - # warmup path. - dummy_lora_requests: List[LoRARequest] = [] - dummy_lora_requests_per_seq: List[LoRARequest] = [] - if self.lora_config: - dummy_lora_requests = self._add_dummy_loras( - self.lora_config.max_loras) - assert len(dummy_lora_requests) == self.lora_config.max_loras - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) - ] - - # Profile memory usage with max_num_sequences sequences and the total - # number of tokens equal to max_num_batched_tokens. 
- seqs: List[SequenceGroupMetadata] = [] - - max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( - self.model_config) - if max_mm_tokens > 0: - logger.info("Starting profile run for multi-modal models.") - - batch_size = 0 - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - batch_size += seq_len - - decoder_dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry, - is_encoder_data=False) - encoder_dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry, - is_encoder_data=True) - - # Having more tokens is over-conservative but otherwise fine - assert len( - decoder_dummy_data.seq_data.prompt_token_ids - ) >= seq_len, ( - f"Expected at least {seq_len} dummy tokens for profiling, " - f"but got: {len(decoder_dummy_data.seq_data.prompt_token_ids)}" - ) - - assert decoder_dummy_data.multi_modal_data is None or \ - encoder_dummy_data.multi_modal_data is None, ( - "Multi-modal data can't be provided in both encoder and decoder" - ) - - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: decoder_dummy_data.seq_data}, - sampling_params=sampling_params, - block_tables=None, - encoder_seq_data=encoder_dummy_data.seq_data, - cross_block_table=None, - lora_request=dummy_lora_requests_per_seq[group_id] - if dummy_lora_requests_per_seq else None, - multi_modal_data=decoder_dummy_data.multi_modal_data - or encoder_dummy_data.multi_modal_data, - multi_modal_placeholders=decoder_dummy_data. - multi_modal_placeholders - or encoder_dummy_data.multi_modal_placeholders) - seqs.append(seq) - - finished_requests_ids = [seq.request_id for seq in seqs] - model_input = self.prepare_model_input( - seqs, finished_requests_ids=finished_requests_ids) - intermediate_tensors = None - self.execute_model(model_input, None, intermediate_tensors) - torch.cuda.synchronize() - return - - def _prepare_encoder_model_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - model_input: EncoderDecoderModelInput, - ) -> Tuple[AttentionMetadata, Optional[torch.Tensor], - Optional[torch.Tensor]]: - """Helper method to prepare the encoder- and cross-attn-related - model inputs based on a given sequence group. These additional inputs - are used to augment an already-computed `EncoderDecoderModelInput` - data structure which already has decoder-related model inputs - populated. - - Sets the following attn_metadata fields: - * `num_encoder_tokens` - * `encoder_seq_lens` - * `encoder_seq_lens_tensor` - * `max_encoder_seq_len` - * `cross_slot_mapping` - * `cross_block_tables` - - Constructs a new model inputs data structure, based on - (1) the existing fields in the `model_inputs` argument, - and (2) the following additional fields which are - computed (or in the case of `attn_metadata`, updated) - by this function: - * attn_metadata - * encoder_input_tokens - * encoder_input_positions - - Arguments: - - * seq_group_metadata_list: list of sequence groups for which to - compute inputs - * model_inputs: model inputs data structure with decoder-oriented - fields already computed. 
- - Return: - - * Updated model inputs data structure - """ - - if len(seq_group_metadata_list) == 0: - return (model_input.attn_metadata, None, None) - - # Since we are not supporting chunked prefill either the entire - # batch is prefill or it is decode - is_prompt = seq_group_metadata_list[0].is_prompt - - # Build encoder inputs - encoder_seq_lens: List[int] = [] - if is_prompt: - # Prefill phase. - cross_block_tables = self._empty_int32_tensor().view( - len(seq_group_metadata_list), -1) - - # Extract input tokens/positions, cross-attention slot-mapping, - # & seq len from each sequence group metadata - ( - encoder_input_tokens, - encoder_input_positions, - cross_slot_mapping, - ) = ( - [], - [], - [], - ) - for seq_group_metadata in seq_group_metadata_list: - # Build seq lens - seq_len = seq_group_metadata.encoder_seq_data.get_len() - token_ids = seq_group_metadata.encoder_seq_data.get_token_ids() - encoder_seq_lens.append(seq_len) - - # Build slot mapping - is_profile_run = (seq_group_metadata.block_tables is None) - if is_profile_run: - # During memory profiling, the block tables are not - # initialized yet. In this case, we just use a dummy - # slot mapping. - # In embeddings, the block tables are {seq_id: None}. - cross_slot_mapping.extend([PAD_SLOT_ID] * seq_len) - else: - for i in range(0, seq_len): - block_number = seq_group_metadata.cross_block_table[ - i // self.block_size] - block_offset = i % self.block_size - slot = block_number * self.block_size + block_offset - cross_slot_mapping.append(slot) - - # Build encoder input tokens - encoder_input_tokens.extend(token_ids) - encoder_input_positions.extend(list(range(0, seq_len))) - - # Convert tokens/positions & cross-attention - # slot-mapping to encoder input tensors - encoder_input_tokens_tensor = self._list_to_long_tensor( - encoder_input_tokens) - encoder_input_positions_tensor = self._list_to_long_tensor( - encoder_input_positions) - cross_slot_mapping_tensor = self._list_to_long_tensor( - cross_slot_mapping) - - else: - # Decode phase. - encoder_input_tokens_tensor = self._empty_long_tensor() - encoder_input_positions_tensor = self._empty_long_tensor() - cross_slot_mapping_tensor = self._empty_long_tensor() - # Extract cross-attention block tables & - # seq len from each sequence group metadata. - # Cross-attention block tables are empty - # during vLLM memory profiling. - cross_block_tables = [] - for seq_group_metadata in seq_group_metadata_list: - for _ in range(len(seq_group_metadata.seq_data)): - encoder_seq_lens.append( - seq_group_metadata.encoder_seq_data.get_len()) - cross_block_table = seq_group_metadata.cross_block_table - cross_block_tables.append([] if ( - cross_block_table is None) else cross_block_table) - - if (model_input.attn_metadata is not None - and model_input.attn_metadata.use_cuda_graph): - # We will be using CUDA graph replay for this decode. - max_len_of_block_table = self.get_max_block_per_batch() - batch_size = len(encoder_seq_lens) - graph_batch_size = self.vllm_config.pad_for_cudagraph( - batch_size) - assert graph_batch_size >= batch_size - cuda_graph_pad_size = graph_batch_size - batch_size - # extend the cross_block_tables and encoder_seq_lens to match - # the graph_batch_size. 
- cross_block_tables.extend([[] - for _ in range(cuda_graph_pad_size) - ]) - encoder_seq_lens.extend( - itertools.repeat(1, cuda_graph_pad_size)) - - else: - max_len_of_block_table = max( - len(block_table) for block_table in cross_block_tables) - - cross_block_tables = make_tensor_with_pad( - cross_block_tables, - max_len=max_len_of_block_table, - pad=0, - dtype=torch.int32, - device=self.device, - ) - - # Compute encoder sequence lengths & encoder - # sequence starting offset tensors - max_encoder_seq_len = max(encoder_seq_lens, default=0) - encoder_seq_lens_tensor = self._list_to_int32_tensor(encoder_seq_lens) - encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + - 1, - dtype=torch.int32, - device=self.device) - torch.cumsum(encoder_seq_lens_tensor, - dim=0, - dtype=encoder_seq_start_loc.dtype, - out=encoder_seq_start_loc[1:]) - - # Update attention metadata with encoder-oriented attributes - attn_metadata = model_input.attn_metadata - assert attn_metadata is not None - ( - attn_metadata.num_encoder_tokens, - attn_metadata.encoder_seq_lens, - attn_metadata.encoder_seq_lens_tensor, - attn_metadata.max_encoder_seq_len, - attn_metadata.encoder_seq_start_loc, - attn_metadata.cross_slot_mapping, - attn_metadata.cross_block_tables, - ) = ( - sum(encoder_seq_lens), - encoder_seq_lens, - encoder_seq_lens_tensor, - max_encoder_seq_len, - encoder_seq_start_loc, - cross_slot_mapping_tensor, - cross_block_tables, - ) - - return (attn_metadata, encoder_input_tokens_tensor, - encoder_input_positions_tensor) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py deleted file mode 100644 index 5a185e7451ad..000000000000 --- a/vllm/worker/model_runner.py +++ /dev/null @@ -1,2045 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import gc -import inspect -import itertools -import time -import weakref -from contextlib import contextmanager -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, - Tuple, Type, TypeVar, Union) - -import numpy as np -import torch -import torch.distributed -import torch.nn as nn -from tqdm.auto import tqdm - -import vllm.envs as envs -from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.attention.backends.abstract import AttentionState -from vllm.attention.backends.utils import CommonAttentionState -from vllm.config import CompilationLevel, VllmConfig -from vllm.core.scheduler import SchedulerOutputs -from vllm.distributed import broadcast_tensor_dict, get_pp_group -from vllm.distributed.kv_transfer import get_kv_transfer_group -from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank, - graph_capture) -from vllm.forward_context import get_forward_context, set_forward_context -from vllm.inputs import INPUT_REGISTRY, InputRegistry -from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager -from vllm.model_executor import SamplingMetadata, SamplingMetadataCache -from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, - get_sampler) -from vllm.model_executor.model_loader import get_model -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.model_executor.models import supports_lora, supports_multimodal -from 
vllm.model_executor.models.utils import set_cpu_offload_max_bytes -from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalKwargs, MultiModalPlaceholderMap, - MultiModalRegistry) -from vllm.sampling_params import SamplingParams -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache, - async_tensor_h2d, flatten_2d_lists, - is_pin_memory_available, supports_dynamo, - weak_ref_tensor) -from vllm.worker.model_runner_base import ( - InputProcessingError, ModelRunnerBase, ModelRunnerInputBase, - ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, - _add_sampling_metadata_broadcastable_dict, - _init_attn_metadata_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -LORA_WARMUP_RANK = 8 - -_NUM_WARMUP_ITERS = 2 - -TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU") - -# For now, bump up cache limits for recompilations during CUDA graph warmups. -torch._dynamo.config.cache_size_limit = 128 -torch._dynamo.config.accumulated_cache_size_limit = 128 - - -@dataclass(frozen=True) -class ModelInputForGPU(ModelRunnerInputBase): - """ - This base class contains metadata needed for the base model forward pass - but not metadata for possible additional steps, e.g., sampling. Model - runners that run additional steps should subclass this method to add - additional fields. - """ - input_tokens: Optional[torch.Tensor] = None - inputs_embeds: Optional[torch.Tensor] = None - input_positions: Optional[torch.Tensor] = None - token_types: Optional[torch.Tensor] = None - seq_lens: Optional[List[int]] = None - query_lens: Optional[List[int]] = None - lora_mapping: Optional["LoRAMapping"] = None - lora_requests: Optional[Set[LoRARequest]] = None - attn_metadata: Optional["AttentionMetadata"] = None - multi_modal_kwargs: Optional[BatchedTensorInputs] = None - request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None - finished_requests_ids: Optional[List[str]] = None - virtual_engine: int = 0 - async_callback: Optional[Callable] = None - scheduler_outputs: Optional[SchedulerOutputs] = None - previous_hidden_states: Optional[torch.Tensor] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "inputs_embeds": self.inputs_embeds, - "input_positions": self.input_positions, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - "multi_modal_kwargs": self.multi_modal_kwargs, - "virtual_engine": self.virtual_engine, - "request_ids_to_seq_ids": self.request_ids_to_seq_ids, - "finished_requests_ids": self.finished_requests_ids, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type[TModelInputForGPU], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> TModelInputForGPU: - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - # Exclude `async_callback` to be able to pickle this object - def __getstate__(self): - state = self.__dict__.copy() - del state["async_callback"] - return state - - # TODO: What happens when we depickle this object? - # How can we update this callback to properly pass it to the engine? 
- def __setstate__(self, state): - self.__dict__.update(state) - self.__dict__.update({'async_callback': None}) - - -@dataclass(frozen=True) -class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU): - """ - Used by the ModelRunner. - """ - sampling_metadata: Optional["SamplingMetadata"] = None - # Used for speculative decoding. We do not broadcast it because it is only - # used by the driver worker. - is_prompt: Optional[bool] = None - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - tensor_dict = { - "input_tokens": self.input_tokens, - "inputs_embeds": self.inputs_embeds, - "input_positions": self.input_positions, - "lora_requests": self.lora_requests, - "lora_mapping": self.lora_mapping, - "multi_modal_kwargs": self.multi_modal_kwargs, - "virtual_engine": self.virtual_engine, - "request_ids_to_seq_ids": self.request_ids_to_seq_ids, - "finished_requests_ids": self.finished_requests_ids, - } - _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) - _add_sampling_metadata_broadcastable_dict(tensor_dict, - self.sampling_metadata) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForGPUWithSamplingMetadata": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - return cls(**tensor_dict) - - -class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]): - """Build ModelInputForGPU from SequenceGroupMetadata.""" - - # Note: ideally we would be using a dataclass(kw_only=True) - # here, so that this can be subclassed easily, - # but kw_only is not supported in python<3.10. - class InterDataForSeqGroup: - """Intermediate data for the current sequence group.""" - - def simple_reinit(self): - self.input_tokens[0].clear() # type: ignore - self.inputs_embeds = None # type: ignore - self.input_positions[0].clear() # type: ignore - self.token_types[0].clear() # type: ignore - self.mrope_input_positions = None # type: ignore - self.seq_lens[0] = 0 # type: ignore - self.orig_seq_lens[0] = 0 # type: ignore - self.prompt_lens[0] = 0 # type: ignore - self.query_lens[0] = 0 # type: ignore - self.context_lens[0] = 0 # type: ignore - self.curr_sliding_window_blocks[0] = 0 # type: ignore - self.lora_index_mapping.clear() # type: ignore - self.lora_prompt_mapping.clear() # type: ignore - self.lora_requests.clear() # type: ignore - - def __init__( - self, - *, - # From sequence group metadata. - request_id: str, - seq_ids: List[int], - is_prompt: bool, - block_tables: Optional[Dict[int, List[int]]], - computed_block_nums: List[int], - n_seqs: int = 0, - - # Input tokens and positions. - input_tokens: Optional[List[List[int]]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - input_positions: Optional[List[List[int]]] = None, - token_types: Optional[List[List[int]]] = None, - mrope_input_positions: Optional[List[List[List[int]]]] = None, - - # The sequence length (may be capped to the sliding window). - seq_lens: Optional[List[int]] = None, - # The original sequence length (before applying sliding window). - # This is used to compute slot mapping. - orig_seq_lens: Optional[List[int]] = None, - # This is used in the dual-chunk flash attention backend. - prompt_lens: Optional[List[int]] = None, - # The query length. 
- query_lens: Optional[List[int]] = None, - # The number of tokens that are already computed. - context_lens: Optional[List[int]] = None, - # The current sliding window block. - curr_sliding_window_blocks: Optional[List[int]] = None, - - # LoRA inputs. - lora_index_mapping: Optional[List[List[int]]] = None, - lora_prompt_mapping: Optional[List[List[int]]] = None, - lora_requests: Optional[Set[LoRARequest]] = None, - - # Multi-modal inputs. - multi_modal_kwargs: Optional[MultiModalKwargs] = None, - multi_modal_placeholder_maps: Optional[Dict[ - str, MultiModalPlaceholderMap]] = None, - - # Whether the prefix cache is hit (prefill only). - prefix_cache_hit: bool = False, - reinit: bool = False, - reinit_use_defaults: bool = False, - encoder_seq_len: int = 0, - ): - if reinit: - assert len(self.seq_ids) == len(seq_ids) # type: ignore - for i, seq_id in enumerate(seq_ids): - self.seq_ids[i] = seq_id # type: ignore - else: - self.seq_ids = seq_ids - - self.request_id = request_id - self.is_prompt = is_prompt - self.block_tables = block_tables - self.computed_block_nums = computed_block_nums - self.n_seqs = n_seqs - self.encoder_seq_len = encoder_seq_len - - if reinit: - if len(self.seq_ids) == 1 and reinit_use_defaults: - self.simple_reinit() - else: - if input_tokens: - self.input_tokens = input_tokens - else: - for seq_id in range(len(self.seq_ids)): - self.input_tokens[seq_id].clear() - - self.inputs_embeds = inputs_embeds - - if input_positions: - self.input_positions = input_positions - else: - for seq_id in range(len(self.seq_ids)): - self.input_positions[seq_id].clear() - - if token_types: - self.token_types = token_types - else: - for seq_id in range(len(self.seq_ids)): - self.token_types[seq_id].clear() - - self.mrope_input_positions = None - - if seq_lens: - self.seq_lens = seq_lens - else: - for seq_id in range(len(self.seq_ids)): - self.seq_lens[seq_id] = 0 - - if orig_seq_lens: - self.orig_seq_lens = orig_seq_lens - else: - for seq_id in range(len(self.seq_ids)): - self.orig_seq_lens[seq_id] = 0 - - if prompt_lens: - self.prompt_lens = prompt_lens - else: - for seq_id in range(len(self.seq_ids)): - self.prompt_lens[seq_id] = 0 - - if query_lens: - self.query_lens = query_lens - else: - for seq_id in range(len(self.seq_ids)): - self.query_lens[seq_id] = 0 - - if context_lens: - self.context_lens = context_lens - else: - for seq_id in range(len(self.seq_ids)): - self.context_lens[seq_id] = 0 - - if curr_sliding_window_blocks: - self.curr_sliding_window_blocks = \ - curr_sliding_window_blocks - else: - for seq_id in range(len(self.seq_ids)): - self.curr_sliding_window_blocks[seq_id] = 0 - - if lora_index_mapping: - self.lora_index_mapping = lora_index_mapping - else: - self.lora_index_mapping.clear() - - if lora_prompt_mapping: - self.lora_prompt_mapping = lora_prompt_mapping - else: - self.lora_prompt_mapping.clear() - - if lora_requests: - self.lora_requests = lora_requests - else: - self.lora_requests.clear() - - else: - self.input_tokens = input_tokens or [] - self.inputs_embeds = inputs_embeds - self.input_positions = input_positions or [] - self.token_types = token_types or [] - self.mrope_input_positions = mrope_input_positions or None - self.seq_lens = seq_lens or [] - self.orig_seq_lens = orig_seq_lens or [] - self.prompt_lens = prompt_lens or [] - self.query_lens = query_lens or [] - self.context_lens = context_lens or [] - self.curr_sliding_window_blocks = \ - curr_sliding_window_blocks or [] - - self.lora_index_mapping = lora_index_mapping or [] - self.lora_prompt_mapping 
= lora_prompt_mapping or [] - self.lora_requests = lora_requests or set() - - self.multi_modal_kwargs = multi_modal_kwargs - self.multi_modal_placeholder_maps = multi_modal_placeholder_maps - self.prefix_cache_hit = prefix_cache_hit - - self.n_seqs = len(self.seq_ids) - - if not reinit: - self.__post_init__() - - def __post_init__(self): - self.n_seqs = len(self.seq_ids) - - self.input_tokens = [[] for _ in range(self.n_seqs)] - self.input_positions = [[] for _ in range(self.n_seqs)] - self.token_types = [[] for _ in range(self.n_seqs)] - self.mrope_input_positions = None - self.seq_lens = [0] * self.n_seqs - self.orig_seq_lens = [0] * self.n_seqs - self.prompt_lens = [0] * self.n_seqs - self.query_lens = [0] * self.n_seqs - self.context_lens = [0] * self.n_seqs - self.curr_sliding_window_blocks = [0] * self.n_seqs - - self.lora_index_mapping = [] - self.lora_prompt_mapping = [] - - def __repr__(self) -> str: - return (f"InterDataForSeqGroup(" - f"request_id={self.request_id}, " - f"seq_ids={self.seq_ids}, " - f"is_prompt={self.is_prompt}, " - f"block_tables={self.block_tables}, " - f"computed_block_nums={self.computed_block_nums}, " - f"n_seqs={self.n_seqs}, " - f"input_tokens={self.input_tokens}, " - f"inputs_embeds.shape=" - f"{getattr(self.inputs_embeds, 'shape', None)}, " - f"input_positions={self.input_positions}, " - f"token_types={self.token_types}, " - f"mrope_input_positions={self.mrope_input_positions}, " - f"seq_lens={self.seq_lens}, " - f"orig_seq_lens={self.orig_seq_lens}, " - f"query_lens={self.query_lens}, " - f"context_lens={self.context_lens}, " - f"multi_modal_kwargs={self.multi_modal_kwargs}") - - def gen_inter_data_builder(self, num_seqs: int): - return lambda: ModelInputForGPUBuilder.InterDataForSeqGroup( - request_id="", - seq_ids=[0] * num_seqs, - is_prompt=True, - block_tables=None, - computed_block_nums=[]) - - def init_cached_inter_data(self, *args, **kwargs): - assert len(args) == 0 - assert "seq_ids" in kwargs - seq_ids = kwargs["seq_ids"] - num_seqs = len(seq_ids) - - # The inter-data cache is per model_runner - inter_data_cache = self.runner.inter_data_cache - if num_seqs not in inter_data_cache: - inter_data_cache[num_seqs] = PyObjectCache( - self.gen_inter_data_builder(num_seqs)) - - obj = inter_data_cache[num_seqs].get_object() - obj.__init__(*args, **kwargs) - return obj - - def reset_cached_inter_data(self): - for cache in self.runner.inter_data_cache.values(): - cache.reset() - - def __init__(self, - runner: "GPUModelRunnerBase", - finished_requests_ids: Optional[List[str]] = None): - super().__init__() - # Compute functions for each sequence in a sequence group. - # WARNING: The order of the functions matters! - self.per_seq_compute_fns = [ - self._compute_lens, - self._compute_for_prefix_cache_hit, - self._compute_for_sliding_window, - self._compute_lora_input, - ] - # Compute functions for each sequence group. - # WARNING: The order of the functions matters! - self.per_seq_group_compute_fns = [ - self._compute_multi_modal_input, - ] - - self.runner = runner - self.model_input_cls = self.runner._model_input_cls - self.attn_backend = self.runner.attn_backend - self.scheduler_config = self.runner.scheduler_config - self.sliding_window = self.runner.sliding_window - self.block_size = self.runner.block_size - self.enable_lora = self.runner.lora_config is not None - - # Attention metadata inputs. - if self.attn_backend is not None: - # spec decode (e.g. 
Medusa) does not have atten backend - self.attn_metadata_builder = self.attn_backend.get_builder_cls()( - weakref.proxy(self)) - - # Engine/Model configurations. - self.chunked_prefill_enabled = ( - self.scheduler_config is not None - and self.scheduler_config.chunked_prefill_enabled) - if self.sliding_window is not None: - self.sliding_window_blocks = ( - self.sliding_window + self.block_size - 1) // self.block_size - self.block_aligned_sliding_window = \ - self.sliding_window_blocks * self.block_size - - def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - self.finished_requests_ids = finished_requests_ids - - # if the current batch is decode-only. - # will be set to False if there is any non-decode request. - self.decode_only = True - - # Intermediate data (data in CPU before going to GPU) for - # the current sequence group. - self.inter_data_list: List[ - ModelInputForGPUBuilder.InterDataForSeqGroup] = [] - - self.attn_metadata_builder.prepare() - - def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """Compute context length, sequence length and tokens - for the given sequence data. - """ - seq_data = seq_group_metadata.seq_data[inter_data.seq_ids[seq_idx]] - token_chunk_size = seq_group_metadata.token_chunk_size - - # Compute context length (the number of tokens that are - # already computed) and sequence length (total number of tokens). - - seq_len = seq_data.get_len() - if inter_data.is_prompt: - context_len = seq_data.get_num_computed_tokens() - seq_len = min(seq_len, context_len + token_chunk_size) - elif self.runner.scheduler_config.is_multi_step or \ - self.runner.model_config.is_encoder_decoder: - context_len = seq_len - 1 - else: - context_len = seq_data.get_num_computed_tokens() - - # Compute tokens. - if seq_data.prompt_embeds is None: - tokens = seq_data.get_token_ids()[context_len:seq_len] - prompt_embeds = None - else: - tokens = [0] * (seq_len - context_len) - prompt_embeds = seq_data.get_token_embeddings( - )[context_len:seq_len] - - token_types = seq_group_metadata.token_type_ids - - inter_data.seq_lens[seq_idx] = seq_len - inter_data.orig_seq_lens[seq_idx] = seq_len - inter_data.prompt_lens[seq_idx] = seq_data.get_prompt_len() - inter_data.context_lens[seq_idx] = context_len - inter_data.input_tokens[seq_idx].extend(tokens) - inter_data.inputs_embeds = prompt_embeds - inter_data.input_positions[seq_idx].extend(range(context_len, seq_len)) - inter_data.token_types[seq_idx].extend( - token_types if token_types else []) - inter_data.query_lens[seq_idx] = seq_len - context_len - - if seq_data.mrope_position_delta is not None: - if inter_data.mrope_input_positions is None: - inter_data.mrope_input_positions = [None] * inter_data.n_seqs - - inter_data.mrope_input_positions[ - seq_idx] = MRotaryEmbedding.get_next_input_positions( - seq_data.mrope_position_delta, - context_len, - seq_len, - ) - - def _compute_for_prefix_cache_hit( - self, inter_data: InterDataForSeqGroup, seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """Check if hit prefix cache (i.e., some blocks are already computed). - If hit, update input tokens and positions to only compute the - remaining blocks. - """ - computed_block_nums = inter_data.computed_block_nums - - # Note that prefix caching does not support sliding window. 
- prefix_cache_hit = (computed_block_nums is not None - and len(computed_block_nums) > 0 - and self.sliding_window is None - and inter_data.is_prompt) - inter_data.prefix_cache_hit = prefix_cache_hit - - if not prefix_cache_hit: - return - - assert computed_block_nums is not None - # The cache hit prompt tokens in this sequence. Note that - # this may be larger than the sequence length if chunked - # prefill is enabled. - prefix_cache_len = len(computed_block_nums) * self.block_size - seq_group_metadata.seq_data[inter_data.seq_ids[ - seq_idx]].update_num_cached_tokens(prefix_cache_len) - - # The number of so far computed prompt tokens in this sequence. - context_len = inter_data.context_lens[seq_idx] - # The total number of prompt tokens in this sequence. - # When chunked prefill is enabled, this is the token number of - # computed chunks + current chunk. - seq_len = inter_data.seq_lens[seq_idx] - if prefix_cache_len <= context_len: - # We already passed the cache hit region, - # so do normal computation. - pass - elif context_len < prefix_cache_len < seq_len: - # Partial hit. Compute the missing part. - uncomputed_start = prefix_cache_len - context_len - inter_data.input_tokens[seq_idx] = inter_data.input_tokens[ - seq_idx][uncomputed_start:] - inter_data.input_positions[seq_idx] = inter_data.input_positions[ - seq_idx][uncomputed_start:] - inter_data.token_types[seq_idx] = inter_data.token_types[seq_idx][ - uncomputed_start:] - context_len = prefix_cache_len - - inter_data.context_lens[seq_idx] = context_len - inter_data.query_lens[ - seq_idx] = inter_data.seq_lens[seq_idx] - context_len - elif seq_len <= prefix_cache_len: - # Full hit. Only compute the last token to avoid - # erroneous behavior. FIXME: Ideally we should directly - # mark all tokens as computed in the scheduler and do not - # schedule this sequence, so this case should not happen. - inter_data.input_tokens[seq_idx] = inter_data.input_tokens[ - seq_idx][-1:] - inter_data.input_positions[seq_idx] = inter_data.input_positions[ - seq_idx][-1:] - inter_data.token_types[seq_idx] = inter_data.token_types[seq_idx][ - -1:] - inter_data.query_lens[seq_idx] = 1 - inter_data.context_lens[seq_idx] = inter_data.seq_lens[seq_idx] - 1 - - def _compute_for_sliding_window(self, inter_data: InterDataForSeqGroup, - seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """Update seq_len and curr_sliding_window_block for the given - sequence data (only required by decoding) if sliding window is enabled. - """ - curr_sliding_window_block = 0 - sliding_seq_len = inter_data.seq_lens[seq_idx] - if not inter_data.is_prompt and self.sliding_window is not None: - # TODO(sang): This is a hack to make sliding window work with - # paged attn. We can remove it if we make paged attn kernel - # to properly handle slinding window attn. 
- curr_sliding_window_block = self.sliding_window_blocks - # number of elements in last block - suff_len = inter_data.seq_lens[seq_idx] % self.block_size - sliding_seq_len = min(inter_data.seq_lens[seq_idx], - self.block_aligned_sliding_window + suff_len) - if suff_len > 0: - curr_sliding_window_block += 1 - - inter_data.curr_sliding_window_blocks[ - seq_idx] = curr_sliding_window_block - inter_data.seq_lens[seq_idx] = sliding_seq_len - - def _compute_lora_input(self, inter_data: InterDataForSeqGroup, - seq_idx: int, - seq_group_metadata: SequenceGroupMetadata): - """If LoRA is enabled, compute LoRA index and prompt mapping.""" - if not self.enable_lora: - return - - lora_id = seq_group_metadata.lora_int_id - if lora_id > 0: - inter_data.lora_requests.add(seq_group_metadata.lora_request) - query_len = inter_data.query_lens[seq_idx] - inter_data.lora_index_mapping.append([lora_id] * query_len) - sampling_params = seq_group_metadata.sampling_params - if sampling_params and sampling_params.prompt_logprobs is not None: - inter_data.lora_prompt_mapping.append([lora_id] * query_len) - elif not self.chunked_prefill_enabled or seq_group_metadata.do_sample: - inter_data.lora_prompt_mapping.append([lora_id]) - else: - inter_data.lora_prompt_mapping.append([]) - - def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, - seq_group_metadata: SequenceGroupMetadata): - """If multi-modal data is given, add it to the input.""" - # NOTE: mm_kwargs only includes the subset of multi-modal items that - # intersect with the current prefill positions. - positions = inter_data.input_positions[0] - mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( - seq_group_metadata, - range(positions[0], positions[0] + len(positions))) - - # M-RoPE requires mrope_positions even for plain text; return early - # when mm_kwargs is empty only if inter_data.is_prompt is False. - if not mm_kwargs and not inter_data.is_prompt: - return - - inter_data.multi_modal_kwargs = mm_kwargs - inter_data.multi_modal_placeholder_maps = placeholder_maps - - # special processing for mrope position deltas. 
- if self.runner.model_config.uses_mrope: - image_grid_thw = mm_kwargs.get("image_grid_thw", None) - video_grid_thw = mm_kwargs.get("video_grid_thw", None) - audio_feature_lengths = mm_kwargs.get("audio_feature_lengths", - None) - - second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) - use_audio_in_video = mm_kwargs.get("use_audio_in_video", False) - hf_config = self.runner.model_config.hf_config - - inter_data.mrope_input_positions = [None] * inter_data.n_seqs - for seq_idx in range(inter_data.n_seqs): - seq_data = seq_group_metadata.seq_data[ - inter_data.seq_ids[seq_idx]] - token_ids = seq_data.get_token_ids() - - mrope_input_positions, mrope_position_delta = \ - MRotaryEmbedding.get_input_positions( - token_ids, - hf_config=hf_config, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - second_per_grid_ts=second_per_grid_ts, - context_len=inter_data.context_lens[seq_idx], - seq_len=inter_data.seq_lens[seq_idx], - audio_feature_lengths=audio_feature_lengths, - use_audio_in_video=use_audio_in_video, - ) - - seq_data.mrope_position_delta = mrope_position_delta - inter_data.mrope_input_positions[ - seq_idx] = mrope_input_positions - - def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): - """Add a sequence group to the builder.""" - seq_ids = seq_group_metadata.seq_data.keys() - n_seqs = len(seq_ids) - is_prompt = seq_group_metadata.is_prompt - - if is_prompt: - assert n_seqs == 1 - self.decode_only = False - - encoder_seq_len = 0 - - if self.runner.model_config.is_encoder_decoder: - encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len() - - inter_data = self.init_cached_inter_data( - request_id=seq_group_metadata.request_id, - seq_ids=seq_ids, - is_prompt=is_prompt, - block_tables=seq_group_metadata.block_tables, - computed_block_nums=seq_group_metadata.computed_block_nums, - reinit=True, - reinit_use_defaults=True, - encoder_seq_len=encoder_seq_len) - - self.inter_data_list.append(inter_data) - - for seq_idx in range(n_seqs): - for per_seq_fn in self.per_seq_compute_fns: - per_seq_fn(inter_data, seq_idx, seq_group_metadata) - for per_seq_group_fn in self.per_seq_group_compute_fns: - per_seq_group_fn(inter_data, seq_group_metadata) - - def _use_captured_graph(self, - batch_size: int, - decode_only: bool, - max_decode_seq_len: int, - max_encoder_seq_len: int = 0) -> bool: - return (decode_only and not self.runner.model_config.enforce_eager - and max_decode_seq_len <= self.runner.max_seq_len_to_capture - and max_encoder_seq_len <= self.runner.max_seq_len_to_capture - and batch_size <= self.runner.max_batchsize_to_capture) - - def _get_cuda_graph_pad_size(self, - num_seqs: int, - max_decode_seq_len: int, - max_encoder_seq_len: int = 0) -> int: - """ - Determine the number of padding sequences required for running in - CUDA graph mode. Returns -1 if CUDA graphs cannot be used. - - In the multi-step + chunked-prefill case, only the first step - has Prefills (if any). The rest of the steps are guaranteed to be all - decodes. In this case, we set up the padding as if all the sequences - are decodes so we may run all steps except the first step in CUDA graph - mode. The padding is accounted for in the multi-step `advance_step` - family of functions. - - Args: - num_seqs (int): Number of sequences scheduled to run. - max_decode_seq_len (int): Greatest of all the decode sequence - lengths. Used only in checking the viablility of using - CUDA graphs. - max_encoder_seq_len (int, optional): Greatest of all the encode - sequence lengths. Defaults to 0. 
Used only in checking the - viability of using CUDA graphs. - Returns: - int: Returns the determined number of padding sequences. If - CUDA graphs is not viable, returns -1. - """ - is_mscp: bool = self.runner.scheduler_config.is_multi_step and \ - self.runner.scheduler_config.chunked_prefill_enabled - decode_only = self.decode_only or is_mscp - if not decode_only: - # Early exit so we can treat num_seqs as the batch_size below. - return -1 - - # batch_size out of this function refers to the number of input - # tokens being scheduled. This conflation of num_seqs as batch_size - # is valid as this is a decode-only case. - batch_size = num_seqs - if not self._use_captured_graph(batch_size, decode_only, - max_decode_seq_len, - max_encoder_seq_len): - return -1 - - graph_batch_size = self.runner.vllm_config.pad_for_cudagraph( - batch_size) - assert graph_batch_size >= batch_size - return graph_batch_size - batch_size - - def build(self) -> ModelInputForGPU: - """Finalize the builder intermediate data and - create on-device tensors. - """ - # Combine and flatten intermediate data. - input_tokens = list[int]() - inputs_embeds_list = list[torch.Tensor]() - token_types = list[int]() - for inter_data in self.inter_data_list: - for cur_input_tokens in inter_data.input_tokens: - input_tokens.extend(cur_input_tokens) - for cur_token_types in inter_data.token_types: - token_types.extend(cur_token_types) - if inter_data.inputs_embeds is not None: - inputs_embeds_list.append( - inter_data.inputs_embeds.to( - dtype=self.runner.model_config.dtype, - device=self.runner.device)) - inputs_embeds: Optional[torch.Tensor] - if len(inputs_embeds_list) == 0: - inputs_embeds = None - else: - inputs_embeds = torch.cat(inputs_embeds_list, dim=0).to( - dtype=self.runner.model_config.dtype, - device=self.runner.device) - assert len(inputs_embeds) == len(input_tokens) - - if not input_tokens and inputs_embeds is None: - # This may happen when all prefill requests hit - # prefix caching and there is no decode request. - return self.model_input_cls() - - mrope_input_positions: Optional[List[List[int]]] = None - if any(inter_data.mrope_input_positions is not None - for inter_data in self.inter_data_list): - mrope_input_positions = [[] for _ in range(3)] - for idx in range(3): - for inter_data in self.inter_data_list: - msections = inter_data.mrope_input_positions - if msections is None: - for _seq_input_positions in inter_data.input_positions: - mrope_input_positions[idx].extend( - _seq_input_positions) - else: - for _seq_mrope_input_positions in msections: - mrope_input_positions[idx].extend( - _seq_mrope_input_positions[idx]) - input_positions = None - else: - input_positions = [] - for inter_data in self.inter_data_list: - for cur_input_positions in inter_data.input_positions: - input_positions.extend(cur_input_positions) - - seq_lens = [] - query_lens = [] - max_decode_seq_len = 0 - max_encoder_seq_len = 0 - for inter_data in self.inter_data_list: - seq_lens.extend(inter_data.seq_lens) - query_lens.extend(inter_data.query_lens) - if not inter_data.is_prompt: - max_decode_seq_len = max(max_decode_seq_len, - max(inter_data.seq_lens)) - if self.runner.model_config.is_encoder_decoder: - max_encoder_seq_len = max(max_encoder_seq_len, - inter_data.encoder_seq_len) - - # Mapping from request IDs to sequence IDs. Used for Jamba models - # that manages the cache by itself. 
- request_ids_to_seq_ids = { - data.request_id: data.seq_ids - for data in self.inter_data_list - } - - cuda_graph_pad_size = self._get_cuda_graph_pad_size( - num_seqs=len(seq_lens), - max_decode_seq_len=max_decode_seq_len, - max_encoder_seq_len=max_encoder_seq_len) - - batch_size = len(input_tokens) - if cuda_graph_pad_size != -1: - # If cuda graph can be used, pad tensors accordingly. - # See `capture_model` API for more details. - # vLLM uses cuda graph only for decoding requests. - batch_size += cuda_graph_pad_size - - # Tokens and positions. - if cuda_graph_pad_size: - input_tokens.extend(itertools.repeat(0, cuda_graph_pad_size)) - assert self.runner.device is not None - input_tokens_tensor = async_tensor_h2d(input_tokens, torch.long, - self.runner.device, - self.runner.pin_memory) - - token_types_tensor = async_tensor_h2d(token_types, torch.long, - self.runner.device, - self.runner.pin_memory) \ - if token_types else None - - if mrope_input_positions is not None: - for idx in range(3): - mrope_input_positions[idx].extend( - itertools.repeat(0, cuda_graph_pad_size)) - input_positions_tensor = async_tensor_h2d(mrope_input_positions, - torch.long, - self.runner.device, - self.runner.pin_memory) - else: - input_positions.extend(itertools.repeat(0, cuda_graph_pad_size)) - input_positions_tensor = async_tensor_h2d(input_positions, - torch.long, - self.runner.device, - self.runner.pin_memory) - # Sequence and query lengths. - if cuda_graph_pad_size: - seq_lens.extend(itertools.repeat(1, cuda_graph_pad_size)) - - # Attention metadata. - attn_metadata = self.attn_metadata_builder.build( - seq_lens, query_lens, cuda_graph_pad_size, batch_size) - - # LoRA data. - lora_requests = set() - lora_mapping = None - if self.enable_lora: - lora_requests = set(r for data in self.inter_data_list - for r in data.lora_requests) - lora_index_mapping = flatten_2d_lists([ - flatten_2d_lists(inter_data.lora_index_mapping) - for inter_data in self.inter_data_list - ]) - if cuda_graph_pad_size: - lora_index_mapping.extend( - itertools.repeat(0, cuda_graph_pad_size)) - lora_prompt_mapping = flatten_2d_lists([ - flatten_2d_lists(inter_data.lora_prompt_mapping) - for inter_data in self.inter_data_list - ]) - - lora_mapping = LoRAMapping( - **dict(index_mapping=lora_index_mapping, - prompt_mapping=lora_prompt_mapping, - is_prefill=not self.decode_only)) - - # Multi-modal data. - multi_modal_kwargs_list = [ - data.multi_modal_kwargs for data in self.inter_data_list - if data.multi_modal_kwargs is not None - ] - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - return self.model_input_cls( - input_tokens=input_tokens_tensor, - inputs_embeds=inputs_embeds, - input_positions=input_positions_tensor, - token_types=token_types_tensor, - attn_metadata=attn_metadata, - seq_lens=seq_lens, - query_lens=query_lens, - lora_mapping=lora_mapping, - lora_requests=lora_requests, - multi_modal_kwargs=multi_modal_kwargs, - request_ids_to_seq_ids=request_ids_to_seq_ids, - finished_requests_ids=self.finished_requests_ids) - - -class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): - """ - Helper class for shared methods between GPU model runners. 
- """ - _model_input_cls: Type[TModelInputForGPU] - _builder_cls: Type[ModelInputForGPUBuilder] - builder: ModelInputForGPUBuilder - - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - return_hidden_states: bool = False, - input_registry: InputRegistry = INPUT_REGISTRY, - mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, - ): - - ModelRunnerBase.__init__(self, vllm_config) - model_config = self.model_config - cache_config = self.cache_config - - self.is_driver_worker = is_driver_worker - self.return_hidden_states = return_hidden_states - - self.device = self.device_config.device - self.pin_memory = is_pin_memory_available() - - self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = model_config.get_sliding_window() - self.block_size = cache_config.block_size - self.max_seq_len_to_capture = self.model_config.max_seq_len_to_capture - self.max_batchsize_to_capture = \ - self.vllm_config.compilation_config.max_capture_size - - # - self.graph_runners: List[Dict[Tuple[int, bool], CUDAGraphRunner]] = [ - {} for _ in range(self.parallel_config.pipeline_parallel_size) - ] - self.graph_memory_pool: Optional[Tuple[ - int, int]] = None # Set during graph capture. - - self.has_inner_state = model_config.has_inner_state - - self.in_profile_run = False - - # When using CUDA graph, the input block tables must be padded to - # max_seq_len_to_capture. However, creating the block table in - # Python can be expensive. To optimize this, we cache the block table - # in numpy and only copy the actual input content at every iteration. - # The shape of the cached block table will be - # (max batch size to capture, max seq len to capture / block size). - self.graph_block_tables = np.zeros( - (self.max_batchsize_to_capture, self.get_max_block_per_batch()), - dtype=np.int32) - - self.cross_layer_shared_graph_block_tables = np.zeros( - (self.max_batchsize_to_capture, self.get_max_block_per_batch()), - dtype=np.int32) - - # Attention-free but stateful models like Mamba need a placeholder attn - # backend, as the attention metadata is needed to manage internal state. - # However we must bypass attention selection altogether for some models - # used for speculative decoding to avoid a divide-by-zero in - # model_config.get_head_size() - num_attn_heads = self.model_config.get_num_attention_heads( - self.parallel_config) - needs_attn_backend = (num_attn_heads != 0 - or self.model_config.is_attention_free) - - self.attn_backend = get_attn_backend( - self.model_config.get_head_size(), - self.model_config.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - use_mla=self.model_config.use_mla, - ) if needs_attn_backend else None - if self.attn_backend: - self.attn_state = self.attn_backend.get_state_cls()( - weakref.proxy(self)) - else: - self.attn_state = CommonAttentionState(weakref.proxy(self)) - - # Multi-modal data support - self.input_registry = input_registry - self.mm_registry = mm_registry - - # Lazy initialization - self.model: nn.Module # Set after load_model - # Set after load_model. - self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None - self.sampler = get_sampler() - - set_cpu_offload_max_bytes( - int(self.cache_config.cpu_offload_gb * 1024**3)) - - # Used to cache python objects - self.inter_data_cache: Dict[int, PyObjectCache] = {} - - # Using the PythonizationCache in Pipeline-Parallel clobbers the - # SequenceGroupToSample object. 
In Pipeline-Parallel, we have - # more than 1 Scheduler, resulting in a potential back-to-back - # prepare_model_inputs() call. This clobbers the cached - # SequenceGroupToSample objects, as we reset the cache during - # every prepare_model_inputs() call. - self.sampling_metadata_cache: SamplingMetadataCache = \ - SamplingMetadataCache() \ - if self.parallel_config.pipeline_parallel_size == 1 else None - - if hasattr(self, "_builder_cls"): - # multi-step model runner does not have `_builder_cls` - self.builder = self._builder_cls(weakref.proxy(self)) - - def load_model(self) -> None: - logger.info("Starting to load model %s...", self.model_config.model) - with DeviceMemoryProfiler(self.device) as m: - time_before_load = time.perf_counter() - self.model = get_model(vllm_config=self.vllm_config) - if self.lora_config: - assert supports_lora( - self.model - ), f"{self.model.__class__.__name__} does not support LoRA yet." - - if supports_multimodal(self.model): - logger.warning( - "Regarding multimodal models, vLLM currently " - "only supports adding LoRA to language model.") - - # Use get_text_config() in case of multimodal models - text_config = self.model_config.hf_config.get_text_config() - - self.lora_manager = LRUCacheWorkerLoRAManager( - self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens, - self.vocab_size, - self.lora_config, - self.device, - self.model.embedding_modules, - self.model.embedding_padding_modules, - max_position_embeddings=text_config. - max_position_embeddings, - ) - self.model = self.lora_manager.create_lora_manager(self.model) - time_after_load = time.perf_counter() - - self.model_memory_usage = m.consumed_memory - logger.info("Model loading took %.4f GiB and %.6f seconds", - self.model_memory_usage / GiB_bytes, - time_after_load - time_before_load) - - - if self.vllm_config.compilation_config.level ==\ - CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): - backend = self.vllm_config.compilation_config.init_backend( - self.vllm_config) - self.model = torch.compile( - self.model, - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend) - - def get_model(self) -> nn.Module: - return self.model - - def save_sharded_state( - self, - path: str, - pattern: Optional[str] = None, - max_size: Optional[int] = None, - ) -> None: - from vllm.model_executor.model_loader import ShardedStateLoader - ShardedStateLoader.save_model( - self.model, - path, - pattern=pattern, - max_size=max_size, - ) - - def save_tensorized_model( - self, - tensorizer_config: TensorizerConfig, - ) -> None: - from vllm.model_executor.model_loader import TensorizerLoader - TensorizerLoader.save_model( - self.model, - tensorizer_config=tensorizer_config, - model_config=self.model_config, - ) - - def get_max_block_per_batch(self) -> int: - block_size = self.block_size - return (self.max_seq_len_to_capture + block_size - 1) // block_size - - def _prepare_model_input_tensors( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - finished_requests_ids: Optional[List[str]] = None - ) -> TModelInputForGPU: - """Helper method to prepare the model input based on a given sequence - group. Prepares metadata needed for the base model forward pass but not - metadata for possible additional steps, e.g., sampling. - - The API assumes seq_group_metadata_list is sorted by prefill -> decode. - - The result tensors and data structure also batches input in prefill - -> decode order. For example, - - - input_tokens[:num_prefill_tokens] contains prefill tokens. 
- - input_tokens[num_prefill_tokens:] contains decode tokens. - - If cuda graph is required, this API automatically pads inputs. - """ - self.builder.prepare(finished_requests_ids) - for seq_group_metadata in seq_group_metadata_list: - try: - self.builder.add_seq_group(seq_group_metadata) - except Exception as e: - # Raise an exception that tracks the ID of the bad request - raise InputProcessingError(seq_group_metadata.request_id, - str(e)) from e - - self.builder.reset_cached_inter_data() - - return self.builder.build() # type: ignore - - @contextmanager - def set_in_profile_run(self): - self.in_profile_run = True - try: - yield - finally: - self.in_profile_run = False - - @torch.inference_mode() - def profile_run(self) -> None: - max_num_batched_tokens = \ - self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - self._dummy_run(max_num_batched_tokens, max_num_seqs) - - def _add_dummy_loras(self, num_loras: int) -> list[LoRARequest]: - assert num_loras > 0 - assert self.lora_manager is not None - - dummy_lora_requests: list[LoRARequest] = [] - with self.lora_manager.dummy_lora_cache(): - for idx in range(num_loras): - lora_id = idx + 1 - dummy_lora_request = LoRARequest( - lora_name=f"warmup_{lora_id}", - lora_int_id=lora_id, - lora_path="/not/a/real/path", - ) - self.lora_manager.add_dummy_lora(dummy_lora_request, - rank=LORA_WARMUP_RANK) - dummy_lora_requests.append(dummy_lora_request) - return dummy_lora_requests - - def _remove_dummy_loras(self): - # Remove dummy loras. - assert self.lora_manager is not None - self.remove_all_loras() - - def _dummy_run(self, - max_num_batched_tokens: int, - max_num_seqs: int = 1) -> None: - with self.set_in_profile_run(): - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = \ - SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - - # This represents the maximum number of different requests - # that will have unique loras, and therefore the max amount of - # memory consumption. Create dummy lora request copies from the - # lora request passed in, which contains a lora from the lora - # warmup path. - dummy_lora_requests: List[LoRARequest] = [] - dummy_lora_requests_per_seq: List[LoRARequest] = [] - if self.lora_config: - dummy_lora_requests = self._add_dummy_loras( - self.lora_config.max_loras) - assert len(dummy_lora_requests) == self.lora_config.max_loras - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) - ] - - # Profile memory usage with max_num_sequences sequences and the - # total number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] - # Additional GPU memory may be needed for multi-modal encoding, - # which needs to be accounted for when calculating the GPU blocks - # for vLLM blocker manager. - # To exercise the worst scenario for GPU memory consumption, - # the number of seqs (batch_size) is chosen to maximize the number - # of images processed. - - max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( - self.model_config) - if max_mm_tokens > 0: - max_num_seqs_orig = max_num_seqs - max_num_seqs = min(max_num_seqs, - max_num_batched_tokens // max_mm_tokens) - if max_num_seqs < 1: - expr = (f"min({max_num_seqs_orig}, " - f"{max_num_batched_tokens} // {max_mm_tokens})") - logger.warning( - "Computed max_num_seqs (%s) to be less than 1. 
" - "Setting it to the minimum value of 1.", expr) - max_num_seqs = 1 - - batch_size = 0 - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - batch_size += seq_len - - dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry) - - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: dummy_data.seq_data}, - sampling_params=sampling_params, - block_tables=None, - lora_request=dummy_lora_requests_per_seq[group_id] - if dummy_lora_requests_per_seq else None, - multi_modal_data=dummy_data.multi_modal_data, - multi_modal_placeholders=dummy_data. - multi_modal_placeholders, - ) - seqs.append(seq) - - # Run the model with the dummy inputs. - num_layers = self.model_config.get_num_layers(self.parallel_config) - # use an empty tensor instead of `None`` to force Dynamo to pass - # it by reference, rather by specializing on the value ``None``. - # the `dtype` argument does not matter, and we use `float32` as - # a placeholder (it has wide hardware support). - # it is important to create tensors inside the loop, rather than - # multiplying the list, to avoid Dynamo from treating them as - # tensor aliasing. - kv_caches = [ - torch.tensor([], dtype=torch.float32, device=self.device) - for _ in range(num_layers) - ] - finished_requests_ids = [seq.request_id for seq in seqs] - model_input = self.prepare_model_input( - seqs, finished_requests_ids=finished_requests_ids) - intermediate_tensors = None - if not get_pp_group().is_first_rank: - intermediate_tensors = \ - self.model.make_empty_intermediate_tensors( - batch_size=batch_size, - dtype=self.model_config.dtype, - device=self.device) - - # Disable KV Scale Calculation for dummy data during profile run - if model_input.attn_metadata is not None: - model_input.attn_metadata.enable_kv_scales_calculation = False - - self.execute_model(model_input, kv_caches, intermediate_tensors) - torch.cuda.synchronize() - if self.lora_config: - self._remove_dummy_loras() - - return - - def remove_all_loras(self): - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.remove_all_adapters() - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.set_active_adapters(lora_requests, lora_mapping) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.add_adapter(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_adapter(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.pin_adapter(lora_id) - - def list_loras(self) -> Set[int]: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.list_adapters() - - @torch.inference_mode() - def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: - """Cuda graph capture a model. - - Note that CUDA graph's performance gain is negligible if number - of batched tokens are larger than 200. 
And since CUDA graph - requires fixed sized tensors, supporting large/variable batch - size requires high GPU memory overhead. Thus, vLLM only captures - decoding requests. Mixed batch (chunked prefill + decoding) or - prefill requests are not captured. - - Since it is used for decoding-only, it assumes there's only 1 token - per sequence in the batch. - """ - assert not self.model_config.enforce_eager - logger.info("Capturing cudagraphs for decoding. This may lead to " - "unexpected consequences if the model is not static. To " - "run the model in eager mode, set 'enforce_eager=True' or " - "use '--enforce-eager' in the CLI. " - "If out-of-memory error occurs during cudagraph capture," - " consider decreasing `gpu_memory_utilization` or " - "switching to eager mode. You can also reduce the " - "`max_num_seqs` as needed to decrease memory usage.") - start_time = time.perf_counter() - start_free_gpu_memory = torch.cuda.mem_get_info()[0] - - # Prepare dummy inputs. These will be reused for all batch sizes. - max_batch_size = self.max_batchsize_to_capture - input_tokens = torch.zeros(max_batch_size, - dtype=torch.long, - device=self.device) - input_positions = torch.zeros(max_batch_size, - dtype=torch.long, - device=self.device) - inputs_embeds = torch.zeros( - (max_batch_size, self.model_config.get_hidden_size()), - dtype=self.model_config.dtype, - device=self.device) - if self.model_config.uses_mrope: - input_positions = torch.tile(input_positions, - (3, 1)).cuda(device=self.device) - # Prepare dummy previous_hidden_states only if needed by the model. - # This is used by draft models such as EAGLE. - previous_hidden_states = None - if "previous_hidden_states" in inspect.signature( - self.model.forward).parameters: - previous_hidden_states = torch.empty( - [max_batch_size, - self.model_config.get_hidden_size()], - dtype=self.model_config.dtype, - device=self.device) - - intermediate_inputs = None - if not get_pp_group().is_first_rank: - intermediate_inputs = self.model.make_empty_intermediate_tensors( - batch_size=max_batch_size, - dtype=self.model_config.dtype, - device=self.device) - - dummy_lora_id: Optional[int] = None - dummy_lora_request: LoRARequest = [] - if self.lora_config: - # The goal is to capture the LoRA kernels in cuda graphs. - # for this purpose, as single dummy lora is sufficient. - dummy_lora_requests = self._add_dummy_loras(num_loras=1) - assert len(dummy_lora_requests) == 1 - dummy_lora_request = dummy_lora_requests[0] - dummy_lora_id = dummy_lora_request.lora_int_id - - with self.attn_state.graph_capture(max_batch_size), graph_capture( - self.device) as graph_capture_context: - # NOTE: Capturing the largest batch size first may help reduce the - # memory usage of CUDA graph. - for virtual_engine in range( - self.parallel_config.pipeline_parallel_size): - # We need to not only iterate over batch sizes, but also whether - # to use inputs_embeds or not, hence we use the cartesian - # product. 
- cudagraph_capture_sizes = self.vllm_config.compilation_config\ - .cudagraph_capture_sizes - cudagraph_inputs_embeds = (( - True, False) if self.model_config.enable_prompt_embeds else - (False, )) - compilation_cases = itertools.product( - cudagraph_capture_sizes, - cudagraph_inputs_embeds, - ) - # Only rank 0 should print progress bar during capture - if get_tensor_model_parallel_rank() == 0: - compilation_cases = tqdm( - list(compilation_cases), - disable=not self.load_config.use_tqdm_on_load, - desc="Capturing CUDA graph shapes") - for batch_size, use_inputs_embeds in compilation_cases: - attn_metadata = ( - self.attn_state.graph_capture_get_metadata_for_batch( - batch_size, - is_encoder_decoder_model=self.model_config. - is_encoder_decoder)) - # Disable KV Scale Calculation for graph capture - attn_metadata.enable_kv_scales_calculation = False - if self.lora_config: - lora_mapping = LoRAMapping( - **dict(index_mapping=[dummy_lora_id] * batch_size, - prompt_mapping=[dummy_lora_id] * batch_size, - is_prefill=False)) - self.set_active_loras(set([dummy_lora_request]), - lora_mapping) - - graph_runner = CUDAGraphRunner( - self.model, self.attn_backend.get_name(), - self.attn_state.graph_clone(batch_size), - self.model_config.is_encoder_decoder) - - capture_inputs = { - "input_ids": - input_tokens[:batch_size], - "inputs_embeds": - inputs_embeds[:batch_size] - if use_inputs_embeds else None, - "positions": - input_positions[..., :batch_size], - "intermediate_inputs": - intermediate_inputs[:batch_size] - if intermediate_inputs is not None else None, - "kv_caches": - kv_caches[virtual_engine], - "attn_metadata": - attn_metadata, - "memory_pool": - self.graph_memory_pool, - "stream": - graph_capture_context.stream - } - if previous_hidden_states is not None: - capture_inputs[ - "previous_hidden_states"] = previous_hidden_states[: - batch_size] - - if self.has_inner_state: - # Only used by Mamba-based models CUDA graph atm (Jamba) - capture_inputs.update({ - "seqlen_agnostic_capture_inputs": - self.model.get_seqlen_agnostic_capture_inputs( - batch_size) - }) - if self.model_config.is_encoder_decoder: - # add the additional inputs to capture for - # encoder-decoder models. - self._update_inputs_to_capture_for_enc_dec_model( - capture_inputs) - - with set_forward_context(attn_metadata, self.vllm_config, - virtual_engine): - graph_runner.capture(**capture_inputs) - self.graph_memory_pool = graph_runner.graph.pool() - self.graph_runners[virtual_engine][( - batch_size, use_inputs_embeds)] = graph_runner - - if self.lora_config: - self._remove_dummy_loras() - - end_time = time.perf_counter() - end_free_gpu_memory = torch.cuda.mem_get_info()[0] - elapsed_time = end_time - start_time - cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory - # This usually takes < 10 seconds. - logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", - elapsed_time, cuda_graph_size / GiB_bytes) - - def _update_inputs_to_capture_for_enc_dec_model(self, - capture_inputs: Dict[str, - Any]): - """ - Updates the set of input tensors needed for CUDA graph capture in an - encoder-decoder model. - - This method modifies the provided `capture_inputs` dictionary by - adding tensors specific to encoder-decoder specific models that - need to be captured for CUDA Graph replay. - """ - # During the decode phase encoder_input_ids and encoder_positions are - # unset. Do the same thing for graph capture. 
- capture_inputs["encoder_input_ids"] = torch.tensor([], - dtype=torch.long, - device=self.device) - capture_inputs["encoder_positions"] = torch.tensor([], - dtype=torch.long, - device=self.device) - - @property - def vocab_size(self) -> int: - return self.model_config.get_vocab_size() - - -class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): - """ - GPU model runner with sampling step. - """ - _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = ( - ModelInputForGPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> ModelInputForGPUWithSamplingMetadata: - model_input = \ - ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - return model_input - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, - ) -> ModelInputForGPUWithSamplingMetadata: - """Prepare the model input based on a given sequence group, including - metadata for the sampling step. - - The API assumes seq_group_metadata_list is sorted by prefill -> decode. - - The result tensors and data structure also batches input in prefill - -> decode order. For example, - - - input_tokens[:num_prefill_tokens] contains prefill tokens. - - input_tokens[num_prefill_tokens:] contains decode tokens. - - If cuda graph is required, this API automatically pads inputs. - """ - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - if get_pp_group().is_last_rank: - # Sampling metadata is only required for the final pp group - generators = self.get_generators(finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, model_input.seq_lens, - model_input.query_lens, self.device, self.pin_memory, - generators, self.sampling_metadata_cache) - else: - sampling_metadata = None - is_prompt = (seq_group_metadata_list[0].is_prompt - if seq_group_metadata_list else None) - return dataclasses.replace(model_input, - sampling_metadata=sampling_metadata, - is_prompt=is_prompt, - virtual_engine=virtual_engine) - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForGPUWithSamplingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - **kwargs, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - if num_steps > 1: - raise ValueError("num_steps > 1 is not supported in ModelRunner") - - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - self.attn_state.begin_forward(model_input) - - # Currently cuda graph is only supported by the decode phase. - assert model_input.attn_metadata is not None - prefill_meta = model_input.attn_metadata.prefill_metadata - decode_meta = model_input.attn_metadata.decode_metadata - # TODO(andoorve): We can remove this once all - # virtual engines share the same kv cache. 
- virtual_engine = model_input.virtual_engine - previous_hidden_states = kwargs.get("previous_hidden_states") - if prefill_meta is None and decode_meta.use_cuda_graph: - assert model_input.input_tokens is not None - graph_batch_size = model_input.input_tokens.shape[0] - use_inputs_embeds = model_input.inputs_embeds is not None - model_executable = self.graph_runners[virtual_engine][( - graph_batch_size, use_inputs_embeds)] - if previous_hidden_states is not None: - previous_hidden_states = torch.cat([ - previous_hidden_states, - torch.empty([ - graph_batch_size - previous_hidden_states.shape[0], - *previous_hidden_states.shape[1:] - ], - dtype=previous_hidden_states.dtype, - device=previous_hidden_states.device) - ]) - else: - model_executable = self.model - - # Receive KV cache in distributed KV cache transfer setting - # In disagg prefill setting, it will also recv hidden states and bypass - # model forwarding - # In KV cache database setting, it will change the model input so that - # we can skip prefilling on tokens that successfully received KV caches - # NOTE: The receive operation is blocking - bypass_model_exec = False - if self.need_recv_kv(model_input, kv_caches): - hidden_or_intermediate_states, bypass_model_exec, model_input = \ - get_kv_transfer_group().recv_kv_caches_and_hidden_states( - # model is used to know which layer the current worker - # is working on, so that we can receive KV for only those - # layers. - model_executable, - model_input, - kv_caches=kv_caches - ) - - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - seqlen_agnostic_kwargs = { - "finished_requests_ids": model_input.finished_requests_ids, - "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_inner_state else {} - model_kwargs = {} - if previous_hidden_states is not None: - model_kwargs["previous_hidden_states"] = previous_hidden_states - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_start = torch.cuda.Event(enable_timing=True) - model_forward_end = torch.cuda.Event(enable_timing=True) - model_forward_start.record() - - if not bypass_model_exec: - with set_forward_context(model_input.attn_metadata, - self.vllm_config, virtual_engine): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - inputs_embeds=model_input.inputs_embeds, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs( - multi_modal_kwargs, - device=self.device, - ), - **seqlen_agnostic_kwargs, - **model_kwargs, - ) - - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.record() - - # Sending KV cache in distributed KV cache transfer setting - # NOTE: the send operation is non-blocking - if self.need_send_kv(model_input, kv_caches): - get_kv_transfer_group().send_kv_caches_and_hidden_states( - # model_executable is used to know which layer the current - # worker is working on, so that we can send KV for only those - # layers. - model_executable, - model_input, - kv_caches, - hidden_or_intermediate_states, - ) - - # Compute the logits in the last pipeline stage. 
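# A minimal sketch of the padding step performed above: a CUDA graph captured
# for a fixed batch size can only be replayed with tensors of exactly that
# shape, so shorter per-step tensors (such as previous_hidden_states) are
# padded with uninitialized rows. Shapes below are hypothetical.
import torch

def pad_to_graph_batch_size(x: torch.Tensor, graph_batch_size: int) -> torch.Tensor:
    # Pad the batch dimension up to the size the graph was captured with.
    if x.shape[0] == graph_batch_size:
        return x
    filler = torch.empty((graph_batch_size - x.shape[0], *x.shape[1:]),
                         dtype=x.dtype, device=x.device)
    return torch.cat([x, filler])

hidden = torch.randn(3, 8)                   # 3 live sequences, hidden size 8
padded = pad_to_graph_batch_size(hidden, 8)  # graph captured for batch size 8
assert padded.shape == (8, 8)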
- if not get_pp_group().is_last_rank: - if (self.is_driver_worker - and hidden_or_intermediate_states is not None - and isinstance(hidden_or_intermediate_states, - IntermediateTensors) - and self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - hidden_or_intermediate_states.tensors["model_forward_time"] = ( - torch.tensor(model_forward_time + orig_model_forward_time)) - return hidden_or_intermediate_states - - logits = self.model.compute_logits(hidden_or_intermediate_states, - model_input.sampling_metadata) - - if self.is_driver_worker: - if model_input.async_callback is not None: - model_input.async_callback() - - # Sample the next token. - assert isinstance(self.sampler, Sampler) - orig_include_gpu_probs = self.sampler.include_gpu_probs_tensor - if model_input.inputs_embeds is not None: - self.sampler.include_gpu_probs_tensor = True - - output: SamplerOutput = self.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time - and output is not None): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - # If there are multiple workers, we are still tracking the - # latency from the start time of the driver worker to the end - # time of the driver worker. The model forward time will then - # end up covering the communication time as well. 
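# A minimal sketch of the forward-time accounting used above: GPU time is
# measured with a pair of timing-enabled CUDA events and, under pipeline
# parallelism, added to whatever "model_forward_time" earlier stages have
# already accumulated. Requires CUDA; the prior-stage value is a stand-in.
import torch

if torch.cuda.is_available():
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    x = torch.randn(1024, 1024, device="cuda")
    start.record()
    y = x @ x                      # stand-in for the model forward pass
    end.record()
    end.synchronize()              # elapsed_time is valid once both events complete

    forward_ms = start.elapsed_time(end)   # milliseconds on the GPU timeline
    prior_stage_ms = torch.tensor(0.0)     # e.g. carried in IntermediateTensors
    total_forward_ms = prior_stage_ms.item() + forward_ms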
- output.model_forward_time = (orig_model_forward_time + - model_forward_time) - - if model_input.inputs_embeds is not None: - if self.is_driver_worker: - sampled_token_ids = [] - valid_outputs = [] - for sequence_group_output in output.outputs: - if len(sequence_group_output.samples) == 0: - continue - assert len(sequence_group_output.samples) == 1 - valid_outputs.append(sequence_group_output) - sampled_token_ids.append( - sequence_group_output.samples[0].output_token) - sampled_token_ids = torch.tensor(sampled_token_ids).to( - self.device) - sampled_token_ids = broadcast_tensor_dict( - {"sampled_token_ids": - sampled_token_ids})["sampled_token_ids"] - else: - sampled_token_ids = broadcast_tensor_dict( - )["sampled_token_ids"] - if len(sampled_token_ids) > 0: - sampled_token_embeds = \ - self.model.get_input_embeddings(sampled_token_ids) - if self.is_driver_worker: - self.sampler.include_gpu_probs_tensor = \ - orig_include_gpu_probs - for i, sequence_group_output in enumerate(valid_outputs): - sequence_group_output.samples[0].output_embed = \ - sampled_token_embeds[i] - - if not self.is_driver_worker: - return [] - - if self.return_hidden_states: - # we only need to pass hidden states of most recent token - assert model_input.sampling_metadata is not None - indices = model_input.sampling_metadata.selected_token_indices - if model_input.is_prompt: - hidden_states = hidden_or_intermediate_states.index_select( - 0, indices) - output.prefill_hidden_states = hidden_or_intermediate_states - elif decode_meta.use_cuda_graph: - hidden_states = hidden_or_intermediate_states[:len(indices)] - else: - hidden_states = hidden_or_intermediate_states - - output.hidden_states = hidden_states - - return [output] - - def need_recv_kv(self, model_input, kv_caches) -> bool: - """Check if we need to receive kv-cache from the other worker. - We need to receive KV when - 1. current vLLM instance is KV cache consumer/decode vLLM instance - 2. this batch is not a profiling run - 3. this batch is a prefill run - - Args: - model_input: input to the model executable - kv_caches: vLLM's paged memory - """ - - if self.vllm_config.kv_transfer_config is None: - return False - - prefill_meta = model_input.attn_metadata.prefill_metadata - - # check if the current run is profiling - is_profile_run = (kv_caches[0].numel() == 0) - # check if the current run is prefill - is_prefill_run = prefill_meta is not None - - return self.vllm_config.kv_transfer_config.is_kv_consumer and ( - not is_profile_run) and is_prefill_run - - def need_send_kv(self, model_input, kv_caches) -> bool: - """Check if we need to send kv-cache to the other worker. - We need to send KV when - 1. current vLLM instance is KV cache producer/prefill vLLM instance - 2. this batch is not a profiling run - 3. 
this batch is a prefill run - - Args: - model_input: input to the model executable - kv_caches: vLLM's paged memory - """ - - if self.vllm_config.kv_transfer_config is None: - return False - - prefill_meta = model_input.attn_metadata.prefill_metadata - - # check if the current run is profiling - is_profile_run = (kv_caches[0].numel() == 0) - # check if the current run is prefill - is_prefill_run = prefill_meta is not None - - return self.vllm_config.kv_transfer_config.is_kv_producer and ( - not is_profile_run) and is_prefill_run - - -# NOTE: this is nn.Module so the profiler can properly capture/group -# kernels calls made within the graph -class CUDAGraphRunner(nn.Module): - - def __init__(self, model: nn.Module, backend_name: str, - attn_state: AttentionState, is_encoder_decoder_model: bool): - super().__init__() - self.model = model - self.backend_name = backend_name - self.attn_state = attn_state - - self.input_buffers: Dict[str, torch.Tensor] = {} - self.output_buffers: Dict[str, torch.Tensor] = {} - - self._graph: Optional[torch.cuda.CUDAGraph] = None - self._is_encoder_decoder_model = is_encoder_decoder_model - - @property - def graph(self): - assert self._graph is not None - return self._graph - - def capture( - self, - input_ids: torch.Tensor, - inputs_embeds: Optional[torch.Tensor], - positions: torch.Tensor, - intermediate_inputs: Optional[IntermediateTensors], - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - memory_pool: Optional[Tuple[int, int]], - stream: torch.cuda.Stream, - **kwargs, - ): - assert self._graph is None - # Run the model a few times without capturing the graph. - # This is to make sure that the captured graph does not include the - # kernel launches for initial benchmarking (e.g., Triton autotune). - # Note one iteration is not enough for torch.compile - for _ in range(_NUM_WARMUP_ITERS): - self.model( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - positions=positions, - intermediate_tensors=intermediate_inputs, - **kwargs, - ) - # Wait for the warm up operations to finish before proceeding with - # Graph Capture. - torch.cuda.synchronize() - # Capture the graph. - self._graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream): - output_hidden_or_intermediate_states = self.model( - input_ids=input_ids, - **({ - "inputs_embeds": inputs_embeds, - } if inputs_embeds is not None else {}), - positions=positions, - intermediate_tensors=intermediate_inputs, - **kwargs, - ) - - if isinstance(output_hidden_or_intermediate_states, torch.Tensor): - hidden_or_intermediate_states = weak_ref_tensor( - output_hidden_or_intermediate_states) - elif isinstance(output_hidden_or_intermediate_states, - IntermediateTensors): - hidden_or_intermediate_states = IntermediateTensors( - tensors={ - key: weak_ref_tensor(value) - for key, value in - output_hidden_or_intermediate_states.tensors.items() - }) - - del output_hidden_or_intermediate_states - # make sure `output_hidden_or_intermediate_states` is deleted - # in the graph's memory pool - gc.collect() - torch.cuda.synchronize() - - # Save the input and output buffers. 
- self.input_buffers = { - "input_ids": - input_ids, - **({ - "inputs_embeds": inputs_embeds, - } if inputs_embeds is not None else {}), - "positions": - positions, - "kv_caches": - kv_caches, - **self.attn_state.get_graph_input_buffers( - attn_metadata, self._is_encoder_decoder_model), - **kwargs, - } - if intermediate_inputs is not None: - self.input_buffers.update(intermediate_inputs.tensors) - if get_pp_group().is_last_rank: - self.output_buffers = { - "hidden_states": hidden_or_intermediate_states - } - else: - self.output_buffers = hidden_or_intermediate_states - - def forward( - self, - input_ids: torch.Tensor, - inputs_embeds: Optional[torch.Tensor], - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors], - **kwargs, - ) -> torch.Tensor: - attn_metadata: AttentionMetadata = get_forward_context().attn_metadata - - # Copy the input tensors to the input buffers. - self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) - if positions is not None: - # in some case like MLA, it will reuse positions in metadata - # but truncate them to the original size - # so the shape is not padded, we need to copy partial only - self.input_buffers["positions"][:positions.shape[0]].copy_( - positions, non_blocking=True) - if inputs_embeds is not None: - self.input_buffers["inputs_embeds"][:inputs_embeds.shape[0]].copy_( - inputs_embeds, non_blocking=True) - - if self.backend_name != "NO_ATTENTION": - self.input_buffers["slot_mapping"].copy_( - attn_metadata.slot_mapping, non_blocking=True) - - self.attn_state.prepare_graph_input_buffers( - self.input_buffers, attn_metadata, self._is_encoder_decoder_model) - - if "seqlen_agnostic_capture_inputs" in self.input_buffers: - self.model.copy_inputs_before_cuda_graphs(self.input_buffers, - **kwargs) - - if "previous_hidden_states" in self.input_buffers: - self.input_buffers["previous_hidden_states"].copy_( - kwargs["previous_hidden_states"], non_blocking=True) - - if intermediate_tensors is not None: - for key in intermediate_tensors.tensors: - if key != "model_execute_time" and key != "model_forward_time": - self.input_buffers[key].copy_(intermediate_tensors[key], - non_blocking=True) - if self._is_encoder_decoder_model: - self.input_buffers["encoder_input_ids"].copy_( - kwargs['encoder_input_ids'], non_blocking=True) - self.input_buffers["encoder_positions"].copy_( - kwargs['encoder_positions'], non_blocking=True) - - # Run the graph. - self.graph.replay() - # Return the output tensor. 
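# A minimal, standalone sketch of the capture/replay contract implemented by
# CUDAGraphRunner above: inputs are copied into the static buffers recorded at
# capture time and the graph is replayed, with results read back from the
# static output buffer. The toy model and shapes are assumptions; requires a
# CUDA device.
import torch

if torch.cuda.is_available():
    with torch.inference_mode():
        model = torch.nn.Linear(16, 16).cuda()
        static_input = torch.zeros(8, 16, device="cuda")  # fixed-shape input buffer

        # Warm up on a side stream so stray default-stream work is not captured.
        side = torch.cuda.Stream()
        side.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(side):
            for _ in range(2):
                model(static_input)
        torch.cuda.current_stream().wait_stream(side)

        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph):
            static_output = model(static_input)  # kernels and buffers are recorded

        # Replay: refill the same input buffer, then replay the recorded kernels.
        static_input.copy_(torch.randn(8, 16, device="cuda"), non_blocking=True)
        graph.replay()
        result = static_output  # aliases the output buffer captured above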
- if get_pp_group().is_last_rank: - return self.output_buffers["hidden_states"] - - return self.output_buffers diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py deleted file mode 100644 index 7b8fe2f802d6..000000000000 --- a/vllm/worker/model_runner_base.py +++ /dev/null @@ -1,317 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from abc import ABC, abstractmethod -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, - TypeVar) - -import torch -import torch.nn as nn - -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.models.interfaces import supports_transcription -from vllm.model_executor.models.interfaces_base import ( - is_pooling_model, is_text_generation_model) -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.tasks import GenerationTask, PoolingTask, SupportedTask - -if TYPE_CHECKING: - from vllm.attention import AttentionMetadata - from vllm.attention.backends.abstract import AttentionBackend - from vllm.model_executor import SamplingMetadata - -logger = init_logger(__name__) - -T = TypeVar('T', bound="BroadcastableModelInput") - - -def _add_attn_metadata_broadcastable_dict( - tensor_dict: Dict[str, Any], - attn_metadata: Optional["AttentionMetadata"]) -> None: - """ - Helper method to update tensor_dict with broadcastable - AttentionMetadata fields. - """ - if attn_metadata is not None: - tensor_dict.update(attn_metadata.asdict_zerocopy()) - - -def _init_attn_metadata_from_tensor_dict( - attn_backend: "AttentionBackend", - tensor_dict: Dict[str, Any], -) -> Dict[str, Any]: - """ - Helper method to initialize AttentionMetadata based on an - AttentionBackend and broadcastable AttentionMetadata fields. - """ - # Extract the fields used to create AttentionMetadata. - valid_attn_kwargs = {} - for field in dataclasses.fields(attn_backend.get_metadata_cls()): - if field.name in tensor_dict: - if field.name == "input_positions": - valid_attn_kwargs[field.name] = tensor_dict[field.name] - else: - valid_attn_kwargs[field.name] = tensor_dict.pop(field.name) - - attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs) - tensor_dict["attn_metadata"] = attn_metadata - return tensor_dict - - -def _init_sampling_metadata_from_tensor_dict( # type: ignore - tensor_dict: Dict[str, Any]) -> Dict[str, Any]: - """ - Helper method to initialize SamplingMetadata based on broadcastable - SamplingMetadata fields. - """ - from vllm.model_executor import SamplingMetadata - - selected_token_indices = tensor_dict.pop("selected_token_indices", None) - # An empty SamplingMetadata to signal that the worker should skip - # sampling. - if selected_token_indices is not None: - tensor_dict["sampling_metadata"] = SamplingMetadata( - seq_groups=None, - selected_token_indices=selected_token_indices, - categorized_sample_indices=None, - num_prompts=0, - ) - return tensor_dict - - -def _add_sampling_metadata_broadcastable_dict( - tensor_dict: Dict[str, Any], - sampling_metadata: Optional["SamplingMetadata"]) -> None: - """ - Helper method to update tensor_dict with broadcastable - SamplingMetadata fields. 
- """ - if sampling_metadata is not None: - tensor_dict["selected_token_indices"] = ( - sampling_metadata.selected_token_indices) - - -def _init_frozen_model_input_from_tensor_dict( - frozen_model_input_cls: Type["ModelRunnerInputBase"], - tensor_dict: Dict[str, Any]) -> Dict[str, Any]: - """ - Helper method to initialize a frozen ModelInput based on broadcastable - """ - valid_tensor_kwargs = {} - for field in dataclasses.fields(frozen_model_input_cls): - val = tensor_dict.pop(field.name, None) - if val is not None: - valid_tensor_kwargs[field.name] = val - - frozen_model_input = frozen_model_input_cls(**valid_tensor_kwargs) - tensor_dict["frozen_model_input"] = frozen_model_input - return tensor_dict - - -class BroadcastableModelInput(ABC): - - @abstractmethod - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - """ - Extract broadcastable fields. Override for fields that require some - custom deserialization. - """ - raise NotImplementedError - - @classmethod - @abstractmethod - def from_broadcasted_tensor_dict( - cls: Type[T], - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> T: - """ - Pop fields from the given tensor_dict and populate a new instance of - BroadcastableModelInput. - """ - raise NotImplementedError - - -@dataclasses.dataclass(frozen=True) -class ModelRunnerInputBase(BroadcastableModelInput): - """Local inputs to each worker's model runner. May contain - device-specific data. Different worker backends may have different methods - of converting from the global ExecuteModelRequest produced by the LLM - engine to the worker-local ModelRunnerInputBase objects. - - Model runners that support multi-GPU execution should define a - ModelRunnerInputBase subclass, add their required fields, and specify how to - serialize/deserialize a ModelInput for broadcast between workers. - """ - pass - - -class ModelRunnerInputBuilderBase(ABC, Generic[T]): - """A builder to create ModelRunnerInputBase objects. - """ - - @abstractmethod - def prepare(self, - finished_requests_ids: Optional[List[str]] = None) -> None: - raise NotImplementedError - - @abstractmethod - def add_seq_group(self, seq_group_metadata): - """TBA""" - raise NotImplementedError - - @abstractmethod - def build(self, *args, **kwargs) -> T: - """Build metadata with on-device tensors.""" - raise NotImplementedError - - -class ModelRunnerBase(ABC, Generic[T]): - """ - Model runner interface that abstracts a particular hardware and/or type of - model. Model execution may communicate data with model runners in other - processes, but it should not include control plane metadata communication. - - Each ModelRunnerBase subclass should define a corresponding - ModelRunnerInputBase subclass. 
- """ - - def __init__( - self, - vllm_config: VllmConfig, - ) -> None: - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - self.cache_config = vllm_config.cache_config - self.lora_config = vllm_config.lora_config - self.load_config = vllm_config.load_config - self.parallel_config = vllm_config.parallel_config - self.scheduler_config = vllm_config.scheduler_config - self.device_config = vllm_config.device_config - self.speculative_config = vllm_config.speculative_config - self.observability_config = vllm_config.observability_config - - # Map of request_id -> generator used for seeded random sampling - generators: Dict[str, torch.Generator] = {} - - @abstractmethod - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> T: - """ - Make an instance of a ModelRunnerInputBase from the broadcasted tensor - dict. - """ - raise NotImplementedError - - @abstractmethod - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, - ) -> T: - """ - Prepare the inputs to ModelRunnerBase.execute_model from an execution - request. This method may move data to the worker's local device. It is - not allowed to communicate with other workers or devices. - """ - raise NotImplementedError - - @abstractmethod - def get_model(self) -> nn.Module: - raise NotImplementedError - - def get_supported_generation_tasks(self) -> list[GenerationTask]: - model = self.get_model() - supported_tasks = list[GenerationTask]() - - if is_text_generation_model(model): - supported_tasks.append("generate") - - if supports_transcription(model): - if model.supports_transcription_only: - return ["transcription"] - - supported_tasks.append("transcription") - - return supported_tasks - - def get_supported_pooling_tasks(self) -> list[PoolingTask]: - model = self.get_model() - if not is_pooling_model(model): - return [] - - return list(model.pooler.get_supported_tasks()) - - def get_supported_tasks(self) -> tuple[SupportedTask, ...]: - tasks = list[SupportedTask]() - - if self.model_config.runner_type == "generate": - tasks.extend(self.get_supported_generation_tasks()) - if self.model_config.runner_type == "pooling": - tasks.extend(self.get_supported_pooling_tasks()) - - return tuple(tasks) - - def execute_model( - self, - model_input: T, - kv_caches: Optional[List[torch.Tensor]], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - **kwargs, - ) -> Optional[List[SamplerOutput]]: - """ - Execute the model on the given input. - """ - raise NotImplementedError - - def get_generators(self, finished_request_ids: Optional[List[str]] = None): - """ - Return dict of per-request generators used for random sampling. - """ - - # Clean up generators from completed requests - if finished_request_ids: - for request_id in finished_request_ids: - self.generators.pop(request_id, None) - - return self.generators - - -class ModelRunnerWrapperBase: - """ - The whole point of this class is to lazily initialize the model_runner. - """ - - def __init__( - self, - model_runner: ModelRunnerBase, - ) -> None: - self.model_runner: ModelRunnerBase = model_runner - - def __getattr__(self, attr): - return getattr(self.model_runner, attr) - - -class InputProcessingError(Exception): - """This exception is raised when an error occurs preparing the inputs for - a single sequence group. 
- This allows the engine to gracefully handle errors with a single sequence - group without having to fail the entire batch. - """ - - def __init__(self, request_id, message): - """request_id is the id of the offending sequence group""" - self.request_id = request_id - self.message = message - super().__init__(self.message) - - def __str__(self): - return "Failed to prepare inputs for sequence group with request id: " \ - f"{self.request_id}, Error: {self.message}" diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py deleted file mode 100644 index 2aa910bdff6b..000000000000 --- a/vllm/worker/multi_step_model_runner.py +++ /dev/null @@ -1,908 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import functools -from dataclasses import dataclass, field -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Union) - -import torch - -from vllm.distributed import get_pp_group -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import (PromptLogprobs, SampleLogprobs, - SamplerOutput, - SamplingMetadata, get_logprobs, - get_pythonized_sample_results) -from vllm.platforms import current_platform -from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, - Logprob, SequenceGroupMetadata, SequenceOutput) -from vllm.utils import PyObjectCache, async_tensor_h2d, current_stream -from vllm.worker.model_runner import (GPUModelRunnerBase, - ModelInputForGPUWithSamplingMetadata) -from vllm.worker.model_runner_base import ( - BroadcastableModelInput, _init_attn_metadata_from_tensor_dict, - _init_frozen_model_input_from_tensor_dict, - _init_sampling_metadata_from_tensor_dict) - -from ..model_executor.model_loader.tensorizer import TensorizerConfig - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - -MULTI_STEP_ATTENTION_BACKENDS = [ - "FLASH_ATTN", "ROCM_FLASH", "FLASHINFER", "NO_ATTENTION" -] -MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN", "FLASHINFER"] - -def _get_supported_attention_backends(chunked_prefill_enabled: bool) \ - -> List[str]: - if chunked_prefill_enabled: - return MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS - else: - return MULTI_STEP_ATTENTION_BACKENDS - - -def seq_output_builder(): - return SequenceOutput( - 0, 0, - {0: Logprob(logprob=float('inf'), rank=None, decoded_token=None)}) - - -def completion_seq_group_output_builder(): - return CompletionSequenceGroupOutput([], None) - - -# Used by pythonization to reduce python object allocations -class PythonizationCache: - - def __init__(self): - self.cached_seq_output = PyObjectCache(seq_output_builder) - self.cached_completion_seq_group_output = PyObjectCache( - completion_seq_group_output_builder) - - def reset(self): - self.cached_seq_output.reset() - self.cached_completion_seq_group_output.reset() - - -@dataclass -class ModelOutput: - """The output of a single model forward pass. - - The sampler_output_ready_event is set when the tensors in - sampler_output are ready (the model+sampler forward pass has - completed). We use the event to synchronize the GPU->CPU transfer, - which we want to only run when the data has been written to the - GPU tensors. Until the event is ready, the tensors in sampler_output - will have garbage data. - - There are two scenarios: - 1. The output tensors are ready and we can pythonize them immediately. - 2. 
The output tensors are not ready and we need to wait for the event to be - ready. - """ - sampler_output: SamplerOutput - sampler_output_ready_event: torch.cuda.Event - sampled_token_ids: Optional[torch.Tensor] = None - pythonized: bool = False - # On-device tensor containing the logprobs of each token. - logprobs: Optional["torch.Tensor"] = None - pythonization_cache: Optional[PythonizationCache] = None - - def pythonize(self, input_metadata: "StatefulModelInput", - copy_stream: torch.cuda.Stream, - pinned_sampled_token_buffer: torch.Tensor) -> None: - """Pythonize the output. Blocking.""" - if not self.pythonized: - self._pythonize_sampler_output(input_metadata, copy_stream, - pinned_sampled_token_buffer, True) - self.pythonized = True - - def maybe_pythonize(self, input_metadata: "StatefulModelInput", - copy_stream: torch.cuda.Stream, - pinned_sampled_token_buffer: torch.Tensor) -> None: - """Pythonize the output if ready, else return None. Non-blocking.""" - if not self.pythonized: - self.pythonized = self._pythonize_sampler_output( - input_metadata, copy_stream, pinned_sampled_token_buffer, - False) - - def _pythonize_sampler_output(self, input_metadata: "StatefulModelInput", - copy_stream: torch.cuda.Stream, - pinned_sampled_token_buffer: torch.Tensor, - blocking: bool) -> bool: - """ - If blocking is set, will block until the forward pass for the output is - ready and pythonize the output. Upon completing Pythonization, erases - self.logprobs (note that a non-blocking call that is performed when - the sampler output is not yet ready, will not erase self.logprobs.) - """ - assert self.sampled_token_ids is not None - if not blocking and not self.sampler_output_ready_event.query(): - return False - - if blocking: - self.sampler_output_ready_event.synchronize() - with torch.cuda.stream(copy_stream): - _pythonize_sampler_output(input_metadata, self.sampler_output, - pinned_sampled_token_buffer, - self.sampled_token_ids, self.logprobs, - self.pythonization_cache) - - # Erase the logprobs GPU-side tensor. - # Note that although _pythonize_sampler_output() runs in its - # own CUDA stream, nonetheless _pythonize_sampler_output() - # cannot return until Pythonization is complete; therefore - # we know that by the time the CPU reaches this point, - # `self.logprobs` is no longer needed. - self.logprobs = None - return True - - -@dataclass(frozen=False) -class StatefulModelInput(BroadcastableModelInput): - # actual frozen model input dataclass passed to _base_model_runner - frozen_model_input: Optional[ModelInputForGPUWithSamplingMetadata] = None - - # list of model outputs for each step, may not be all pythonized - cached_outputs: List[ModelOutput] = field(default_factory=list) - - # used to pass sampled token ids from the last step to the current step for - # TP workers. 
Used to append to end of outputs and used by advance_step - last_sampled_token_ids: Optional[torch.Tensor] = None - current_step: int = 0 - is_multi_step: bool = True - is_last_step: bool = False - is_first_multi_step: bool = False - base_output_proc_callback: Optional[Callable] = None - # ping-pong data structures for multi-step to wait on the previous step - step_cuda_events: List[current_platform.Event] = field( - default_factory=lambda: [current_platform.Event(blocking=True)] * 2) - num_seqs: int = -1 - num_queries: int = -1 - num_single_step_prefills: int = 0 - - def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: - assert self.frozen_model_input is not None - tensor_dict = self.frozen_model_input.as_broadcastable_tensor_dict() - new_tensor_dict = { - 'last_sampled_token_ids': self.last_sampled_token_ids, - 'current_step': self.current_step, - 'is_multi_step': self.is_multi_step, - 'is_last_step': self.is_last_step, - 'is_first_multi_step': self.is_first_multi_step, - 'num_seqs': self.num_seqs, - 'num_queries': self.num_queries, - 'num_single_step_prefills': self.num_single_step_prefills, - } - tensor_dict.update(new_tensor_dict) - return tensor_dict - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "StatefulModelInput": - tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) - if attn_backend is not None: - tensor_dict = _init_attn_metadata_from_tensor_dict( - attn_backend, tensor_dict) - tensor_dict = _init_frozen_model_input_from_tensor_dict( - ModelInputForGPUWithSamplingMetadata, tensor_dict) - - return cls(**tensor_dict) - - def record_step_event(self, current_stream: torch.cuda.Stream): - # record the event for the current step so that the next step can sync - # on it. We modulo by 2 to keep the events in a circular buffer and - # support any attn backends that may be supported in the future. ie - # Flashinfer would want two DecodeWrappers to overlap the CPU and GPU. - self.step_cuda_events[self.current_step & 1] = \ - torch.cuda.Event(blocking=True) - self.step_cuda_events[self.current_step & 1].record(current_stream) - - def wait_previous_step(self): - # These cuda events are an explicit synchronization to ensure that - # advance_step() (for other attn backends that may be supported in the - # future) do not clobber any data structures that is also used by any - # enqueued forwards steps. For distributed case, only a single event is - # needed, but for single GPU case, since we can let the CPU run much - # further ahead, two events allow us to overlap the advance_step with - # the previous forward (ie using two DecodeWrappers for flashinfer - # backend) - self.step_cuda_events[(self.current_step + 1) & 1].wait() - - def add_sampler_output(self, - sampler_output: SamplerOutput, - sampled_token_ids: Optional[torch.Tensor] = None): - self.cached_outputs.append( - ModelOutput(sampler_output=sampler_output, - sampler_output_ready_event=None, - sampled_token_ids=sampled_token_ids, - pythonized=False)) - - def maybe_advance_sampling_metadata(self, device: str, pin_memory: bool): - """ - sampling_metadata.selected_token_indices is constructed for the - first-step in Multi-Step. However, when chunked-prefill is enabled with - multi-step, the scheduled prompts are fully processed in the - first-step and are processed as decodes in the rest of the steps. - This function updates the sampling_metadata.selected_token_indices - to account for this conversion. 
- - Example: - Let 2 prompts and 2 decodes be scheduled together. Let the - num-tokens to process for the 2 prompts be 5 and 8 respectively. - - In that case, sampling_metadata.sampled_token_indices will be, - [4, 12, 13, 14] as it is constructed for the first-step in - multi-step. - However, the prompts turns to decodes after the first-step - and the num-tokens for the previously-prompt sequences will - be 1 and 1 as they are decodes now. The self.sampled_token_indices - must be updated to [0,1,2,3]. - """ - assert self.current_step == 1 and self.num_single_step_prefills > 0 - if not get_pp_group().is_last_rank: - return - - assert self.frozen_model_input is not None - assert self.frozen_model_input.sampling_metadata is not None - self.frozen_model_input.sampling_metadata.selected_token_indices = \ - async_tensor_h2d(list(range(self.num_queries)), - dtype=torch.long, - target_device=device, - pin_memory=pin_memory) - - def maybe_advance_frozen_model_input(self, device: str, pin_memory: bool): - """ - Advancing the datastructures of StatefulModelInput::frozen_model_input - is only required when prefills are scheduled with decodes to run in - multi-step. This advancement/correction is required to account for - the conversion of Prefills to Decodes after the first multi-step. - """ - if self.current_step != 1 or self.num_single_step_prefills == 0: - return - - assert self.frozen_model_input is not None - fmi = self.frozen_model_input - - # Truncate input_tokens - assert fmi.input_tokens is not None - assert fmi.input_tokens.shape[0] >= self.num_seqs - fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs] - - # Update frozen_model_input::input_positions. - assert fmi.input_positions is not None - assert fmi.input_positions.shape[0] >= self.num_seqs - fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self. - num_seqs] - - # Assert unsupported - assert fmi.lora_mapping is None - assert fmi.lora_requests is not None - assert len(fmi.lora_requests) == 0 - assert fmi.attn_metadata is not None - assert fmi.multi_modal_kwargs is not None - assert len(fmi.multi_modal_kwargs) == 0 - - self.frozen_model_input = dataclasses.replace( - self.frozen_model_input, - input_tokens=fmi_new_input_tokens, - input_positions=fmi_new_input_positions) - - self.maybe_advance_sampling_metadata(device, pin_memory) - - -# MutableModelInputForGPUWithMultiStepMetadata is not subclass of -# ModelInputForGPU but it wraps the actual input dataclass and adds multi-step -# metadata -# mypy: disable-error-code=type-var -class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]): - # mypy: enable-error-code=type-var - - def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs): - - super().__init__(*args, **kwargs) - - # Check attention backend support. - supported_attention_backends: List[str] = \ - _get_supported_attention_backends( - self.scheduler_config.chunked_prefill_enabled) - if self.attn_backend.get_name() not in supported_attention_backends: - ms_config_str: str = "Multi-Step + Chunked-Prefill" \ - if self.scheduler_config.chunked_prefill_enabled \ - else "Multi-Step" - raise ValueError( - f"{ms_config_str} not supported for attention backend: " - f"{self.attn_backend.get_name()}. 
Set VLLM_ATTENTION_BACKEND " - f"to a value from {supported_attention_backends}.") - - # uses the base model runner to execute the model and wraps it with - # multi-step logic - self._base_model_runner: GPUModelRunnerBase = base_model_runner - - self.is_multi_step = self.scheduler_config.is_multi_step - self.pinned_sampled_token_ids: Optional[torch.Tensor] = None - - # Using the PythonizationCache in Pipeline-Parallel clobbers the - # SequenceOutput and CompletionSequenceGroupOutput object. - # When cache-reset happens at the last step of a multi-step - # execution, there may be other on-going single-step/multi-step - # executions. The current caching implementation does not check - # for this. - self.pythonization_cache = PythonizationCache() \ - if self.parallel_config.pipeline_parallel_size == 1 else None - - @functools.cached_property - def _copy_stream(self): - # used to copy tensors from GPU to CPU asynchronously - return torch.cuda.Stream() - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> StatefulModelInput: - model_input = (StatefulModelInput.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - )) - return model_input - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> StatefulModelInput: - frozen_model_input: ModelInputForGPUWithSamplingMetadata = \ - self._base_model_runner.prepare_model_input( - seq_group_metadata_list, - virtual_engine, - finished_requests_ids) - - assert frozen_model_input.query_lens is not None - assert frozen_model_input.seq_lens is not None - assert frozen_model_input.attn_metadata is not None - num_queries = len(frozen_model_input.query_lens) - num_seqs = len(frozen_model_input.seq_lens) - num_single_step_prefills = frozen_model_input.attn_metadata.num_prefills - - model_input = StatefulModelInput( - frozen_model_input=frozen_model_input, - num_seqs=num_seqs, - num_queries=num_queries, - num_single_step_prefills=num_single_step_prefills) - - return model_input - - def _async_process_outputs(self, model_input: StatefulModelInput, - output_proc_callback: Callable): - # Proceed with pythonization and output_proc in order. 
- # Stop on the first one that fails to pythonize - output_proc_callback() - - cont = True - for step_num, model_output in enumerate(model_input.cached_outputs): - if not model_output.pythonized: - model_output.maybe_pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - if model_output.pythonized: - ctx = output_proc_callback.keywords["ctx"] - ctx.append_output( - outputs=[model_output.sampler_output], - seq_group_metadata_list=ctx.seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=step_num == 0) - - output_proc_callback() - else: - cont = False - - if not cont: - break - - def _final_process_outputs( - self, model_input: StatefulModelInput, - output_proc_callback: Optional[Callable]) -> List[SamplerOutput]: - assert model_input.frozen_model_input is not None - - has_async_callback = output_proc_callback is not None - - outputs = [] - for step_num, output in enumerate(model_input.cached_outputs): - is_last_step = step_num == len(model_input.cached_outputs) - 1 - - # For non-async case: - # -- We simply add the outputs - # For async case: - # -- Invoke callback, pythonize, add to callback queue and repeat - # -- For last output, just add to callback queue - if has_async_callback: - assert output_proc_callback is not None - - # Invoke callback before pythonize (to overlap with GPU) - output_proc_callback() - - # Pythonize - if not output.pythonized: - output.pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - - # For non last step, add to callback queue to chain - # callbacks=>pythonize pairs (for GPU overlap) - if not is_last_step: - ctx = output_proc_callback.keywords[ # type: ignore - "ctx"] # type: ignore - ctx.append_output( - outputs=[output.sampler_output], - seq_group_metadata_list=ctx. - seq_group_metadata_list, - scheduler_outputs=ctx.scheduler_outputs, - is_async=False, - is_last_step=False, - is_first_step_output=step_num == 0) - else: - outputs.append(output.sampler_output) - else: - output.pythonize(model_input, self._copy_stream, - self.pinned_sampled_token_ids) - outputs.append(output.sampler_output) - - return outputs - - @torch.inference_mode() - def execute_model( - self, - model_input: StatefulModelInput, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - """ - Execute the model for a single step and update multi-step - metadata - """ - assert num_steps == 1, "MultiStepModelRunner only supports num_steps=1" - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - # path for warm up runs - if not model_input.is_multi_step: - return self._base_model_runner.execute_model( - frozen_model_input, None, intermediate_tensors, num_steps) - - # make sure we skip the sampler on the lask rank and only pythonize - # if CPU is ahead. 
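# A minimal sketch of the pinned host buffer allocated just below: page-locked
# CPU memory lets sampled token ids be copied off the GPU asynchronously and
# then read on the host after a single stream synchronization. Shapes and
# vocabulary size here are hypothetical.
import torch

if torch.cuda.is_available():
    max_num_seqs = 4
    pinned_sampled_token_ids = torch.zeros((max_num_seqs, 1),
                                           dtype=torch.long,
                                           device="cpu",
                                           pin_memory=True)

    gpu_token_ids = torch.randint(0, 32000, (max_num_seqs, 1),
                                  dtype=torch.long, device="cuda")
    pinned_sampled_token_ids.copy_(gpu_token_ids, non_blocking=True)
    torch.cuda.current_stream().synchronize()   # make sure the async copy has landed
    tokens = pinned_sampled_token_ids.tolist()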
- if self.is_driver_worker and get_pp_group().is_last_rank: - if self.pinned_sampled_token_ids is None: - self.pinned_sampled_token_ids = torch.zeros( - (self.scheduler_config.max_num_seqs, 1), - dtype=torch.long, - device="cpu", - pin_memory=True) - - self._base_model_runner.sampler.include_gpu_probs_tensor = True - if frozen_model_input.sampling_metadata: - frozen_model_input.sampling_metadata.skip_sampler_cpu_output = ( - True) - - # some pre-execute model logic for multi-step: - # - if it's the first step, we need to reset the sampling tensors - # - if it's not the first step, we need to advance the step using the - # appended sampler output from last iteration - # - also maybe pythonize if CPU is ahead of GPU - - stream = current_stream() - if not model_input.is_first_multi_step: - # Explicitly block on the previous step's forward to make sure we - # don't clobber any GPU tensors still in use. - # This is not needed for flashattn backend, but for other attn - # backends such as flashinfer that performs extra CPU operations on - # input metadata we may need to synchronize any CPU operations that - # might clobber enqueued forwards. (prevents CPU from running too - # far ahead if needed) - model_input.wait_previous_step() - model_input = self._advance_step( - model_input, model_input.cached_outputs[-1].sampler_output) - - # frozen_model_input may have been updated - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - if model_input.base_output_proc_callback is None: - assert frozen_model_input is not None - model_input.base_output_proc_callback = \ - frozen_model_input.async_callback - - if frozen_model_input.async_callback is not None: - assert model_input.base_output_proc_callback is not None - async_callback = functools.partial( - self._async_process_outputs, - model_input=model_input, - output_proc_callback=model_input.base_output_proc_callback) - - model_input.frozen_model_input = dataclasses.replace( # type: ignore - model_input.frozen_model_input, - async_callback=async_callback) - # Update the local instance - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - - # Execute the model - output = self._base_model_runner.execute_model(frozen_model_input, - None, - intermediate_tensors, - num_steps=1) - - # record the event for the current step so that the next step can sync - model_input.record_step_event(stream) - - if get_pp_group().is_last_rank and self.is_driver_worker: - assert isinstance(output, list) - assert len( - output - ) == 1, "MultiStepModelRunner requires single-step base_models" - - # event for the pythonization so that we only pythonize if the - # tensors are ready. May be able to be combined with the step event - output_ready_event = torch.cuda.Event() - output_ready_event.record(stream) - if self.parallel_config.pipeline_parallel_size > 1: - output[0].sampled_token_ids_cpu = output[ - 0].sampled_token_ids.cpu() - model_input.cached_outputs.append( - ModelOutput(output[0], output_ready_event, - output[0].sampled_token_ids, False, - output[0].logprobs, self.pythonization_cache)) - - # These GPU tensors are not required by multi-step; - # erase them to ensure they are not pythonized or - # transferred to CPU - output[0].sampled_token_ids = None - output[0].sampled_token_probs = None - output[0].logprobs = None - - # Pythonize the output if CPU is ahead and the previous step is - # ready. 
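# A minimal sketch of the readiness check behind maybe_pythonize: an event
# recorded right after the forward pass is polled with query(), and the
# GPU->CPU conversion only runs once it reports completion; otherwise it is
# retried on a later step. Requires CUDA; names are illustrative.
import torch

if torch.cuda.is_available():
    stream = torch.cuda.current_stream()
    output_ready = torch.cuda.Event()

    gpu_result = torch.randn(4, device="cuda")   # stand-in for sampler output tensors
    output_ready.record(stream)

    def maybe_pythonize(result: torch.Tensor, ready: torch.cuda.Event):
        # Non-blocking: convert to Python objects only if the GPU work finished.
        if not ready.query():
            return None
        return result.tolist()

    maybe = maybe_pythonize(gpu_result, output_ready)  # None while the GPU is still busy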
- if frozen_model_input.async_callback is None: - for model_output in model_input.cached_outputs: - model_output.maybe_pythonize(model_input, - self._copy_stream, - self.pinned_sampled_token_ids) - - model_input.current_step += 1 - - if not get_pp_group().is_last_rank: - # Should be IntermediateTensors - assert isinstance(output, IntermediateTensors) - return output - if not self.is_driver_worker: - return [] - - # Pythonize the output and block if needed since it is the last step - if model_input.is_last_step: - outputs = self._final_process_outputs( - model_input, model_input.base_output_proc_callback) - if self.pythonization_cache: - self.pythonization_cache.reset() - return outputs - - # should be [SamplerOutput] - return output - - def _update_sampling_metadata(self, sampling_metadata: SamplingMetadata, - num_seqs: Optional[int], num_queries: int): - - assert sampling_metadata.num_prompts == 0 - assert len(sampling_metadata.seq_groups) == num_queries - assert sampling_metadata.selected_token_indices.shape == ( - num_queries, ) - # assert sampling_metadata.categorized_sample_indices == TODO: Add if needed # noqa: E501 - - # Verify that all sequences are decodes - for i in range(num_queries): - seq_group = sampling_metadata.seq_groups[i] - - assert seq_group.is_prompt is False # No prompt - assert seq_group.prompt_logprob_indices == [] # No prompt - assert seq_group.sample_indices == [i] # Simple - assert seq_group.seq_len is None # Decode - assert seq_group.query_len is None # Decode - - def _advance_step(self, model_input: StatefulModelInput, - out: SamplerOutput) -> StatefulModelInput: - - model_input.maybe_advance_frozen_model_input(self.device, - self.pin_memory) - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - assert frozen_model_input.input_tokens is not None - assert frozen_model_input.input_tokens.shape[0] == model_input.num_seqs - assert frozen_model_input.attn_metadata is not None - - sampled_token_ids = model_input.cached_outputs[-1].sampled_token_ids - num_seqs = model_input.num_seqs - num_queries = model_input.num_queries - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - attn_metadata = frozen_model_input.attn_metadata - assert attn_metadata is not None - - turn_prefills_into_decodes: bool = model_input.current_step == 1 and \ - model_input.num_single_step_prefills != 0 - attn_metadata.advance_step( - frozen_model_input, - sampled_token_ids, - self.block_size, - num_seqs, - num_queries, - turn_prefills_into_decodes=turn_prefills_into_decodes) - - return model_input - - def load_model(self) -> None: - self._base_model_runner.load_model() - self.model_memory_usage = self._base_model_runner.model_memory_usage - - def save_sharded_state( - self, - path: str, - pattern: Optional[str] = None, - max_size: Optional[int] = None, - ) -> None: - return self._base_model_runner.save_sharded_state( - path, pattern, max_size) - - def save_tensorized_model(self, - tensorizer_config: TensorizerConfig) -> None: - return self._base_model_runner.save_tensorized_model(tensorizer_config) - - def profile_run(self) -> None: - return self._base_model_runner.profile_run() - - def remove_all_loras(self): - return self._base_model_runner.remove_all_loras() - - def capture_model(self, kv_caches: List[List]) -> None: - return self._base_model_runner.capture_model(kv_caches) - - @property - def vocab_size(self) -> int: - return self._base_model_runner.vocab_size - - -DeferredLogprobsReturnType = 
Tuple[Optional[List[Optional[PromptLogprobs]]], - Optional[List[SampleLogprobs]]] - - -def deferred_pythonize_logprobs( - output: SamplerOutput, - sampling_metadata: SamplingMetadata, - logprobs_tensor: Optional[torch.Tensor], -) -> DeferredLogprobsReturnType: - """Perform deferred logprob Pythonization. - - 1. Pythonize GPU-side sampler result tensors into CPU-side sampler result. - 2. Pythonize GPU-side logprobs tensor into CPU-side logprobs lists, - utilizing the Pythonized sampler result computed in step 1. - - These deferred computations are not required for single-step scheduling - or the `profile_run()` phase of multi-step scheduling. - - Args: - output: sampler output (under deferred Pythonization) - sampling_metadata - - Returns: - prompt_logprobs (CPU), sample_logprobs (CPU) - """ - - # - Deferred pythonization of sample result - sampler_result = get_pythonized_sample_results( - output.deferred_sample_results_args) - - # - Erase the GPU-side deferred sample_result - # computation args to ensure it is never - # pythonized or transferred to CPU - output.deferred_sample_results_args = None - - # - Deferred pythonization of logprobs - ( - prompt_logprobs, - sample_logprobs, - ) = get_logprobs(logprobs_tensor, sampling_metadata, sampler_result) - assert len(prompt_logprobs) == len(sampling_metadata.seq_groups) - assert len(sample_logprobs) == len(sampling_metadata.seq_groups) - - return prompt_logprobs, sample_logprobs - - -def _pythonize_sampler_output( - model_input: StatefulModelInput, - output: SamplerOutput, - pinned_sampled_token_buffer: torch.Tensor, - sampled_token_ids: torch.Tensor, - logprobs_tensor: Optional[torch.Tensor], - cache: Optional[PythonizationCache], -) -> None: - """ This function is only called when the output tensors are ready. - See [`ModelOutput`][vllm.worker.multi_step_model_runner.ModelOutput]. - - Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, - adding a Pythonized output data structure - ([`CompletionSequenceGroupOutput`][vllm.sequence.CompletionSequenceGroupOutput]) - for each [`SequenceGroup`][vllm.sequence.SequenceGroup]. - - Args: - model_input - output: sampler output - pinned_sampled_token_token_buffer: CPU-side pinned memory - (receives copy of - GPU-side token buffer.) - sampled_token_ids: GPU-side token buffer - logprobs_tensor: GPU-side tensor containing - logprobs computed during sampling - """ - - assert model_input.frozen_model_input is not None - - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input.sampling_metadata is not None - sampling_metadata = frozen_model_input.sampling_metadata - # samples generation should have been skipped - assert not output.outputs - - pinned_buffer = pinned_sampled_token_buffer[:model_input.num_queries] - - # We guarantee output tensors are ready, so it is safe to - # pythonize the sampler output & obtain CPU-side logprobs. - # - # However we should check whether logprobs pythonization may - # be skipped entirely, i.e. because no logprobs were requested - # or pythonization was not deferred. To that end, - # - # * `prompt_logprobs_are_requested_for_prefill` signals that - # there are *any* prefill-phase requests which specify that - # prompt logprobs should be returned. - # - # * `any_logprobs_are_requested` signals that there are any - # requests which (1) specify that sample logprobs should be - # returned, or (2) are in the prefill phase AND specify that - # prompt logprobs should be returned. 
- # - # Later on, these flags cause adjustments to the pythonization - # process to accommodate logprobs. - - seq_groups = sampling_metadata.seq_groups - prompt_logprobs_are_requested_for_prefill = any([ - sg.sampling_params.prompt_logprobs is not None and sg.is_prompt - for sg in seq_groups - ]) - any_logprobs_are_requested = ( - prompt_logprobs_are_requested_for_prefill - or any([sg.sampling_params.logprobs is not None for sg in seq_groups])) - - if prompt_logprobs_are_requested_for_prefill: - # CPU GPU sync, after gathering *only* sampled tokens (since - # requesting prompt logprobs leads `sampled_token_ids` to - # include prompt token ids in addition to sampled token ids.) - sample_idx_tensor = torch.tensor( - [sdx for sg in seq_groups for sdx in sg.sample_indices]) - pinned_buffer = pinned_buffer.copy_( - sampled_token_ids[sample_idx_tensor, :], non_blocking=False) - else: - # CPU GPU sync - pinned_buffer = pinned_buffer.copy_(sampled_token_ids, - non_blocking=False) - - # this will not block as the tensors are already on CPU - samples_list = pinned_buffer.tolist() - - skip_sampler_cpu_output = ( - frozen_model_input.sampling_metadata.skip_sampler_cpu_output) - - # *Don't* skip logprobs pythonization *if*: - # * Any requests require logprobs to be returned in this - # iteration AND - # * These requests are being scheduled in a fashion which - # defers pythonization (i.e. multi-step scheduling.) - do_pythonize_logprobs = (skip_sampler_cpu_output - and any_logprobs_are_requested) - ( - prompt_logprobs, - sample_logprobs, - ) = (deferred_pythonize_logprobs(output, sampling_metadata, - logprobs_tensor) - if do_pythonize_logprobs else (None, None)) - - for sgdx, (seq_group, - sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - # (Check for Guided Decoding) - if seq_group.sampling_params.logits_processors: - assert len(seq_group.sampling_params.logits_processors) == 0, ( - "Logits Processors are not supported in multi-step decoding") - - if do_pythonize_logprobs: - assert prompt_logprobs is not None - assert sample_logprobs is not None - - ( - group_prompt_logprobs, - group_sample_logprobs, - ) = ( # Utilize deferred pythonization results - prompt_logprobs[sgdx], - sample_logprobs[sgdx], - ) - elif any_logprobs_are_requested: - ( - group_prompt_logprobs, - group_sample_logprobs, - ) = ( - # profile_run: use already-computed logprobs - output.outputs[sgdx].prompt_logprobs, - [sample.logprobs for sample in output.outputs[sgdx].samples]) - - seq_ids = seq_group.seq_ids - next_token_ids = sample_result - parent_ids = [0] - seq_outputs: List[SequenceOutput] - - if cache is not None: - completion_seq_group_output: CompletionSequenceGroupOutput = \ - cache.cached_completion_seq_group_output.get_object() - completion_seq_group_output.samples.clear() - seq_outputs = completion_seq_group_output.samples - else: - seq_outputs = [] - - for tdx, (parent_id, - next_token_id) in enumerate(zip(parent_ids, next_token_ids)): - if cache is not None: - seq_output: SequenceOutput = cache.cached_seq_output.get_object( - ) - seq_output.parent_seq_id = seq_ids[parent_id] - seq_output.output_token = next_token_id - - if any_logprobs_are_requested: - seq_output.logprobs = group_sample_logprobs[tdx] - else: - logprobs = next(iter(seq_output.logprobs.values())) - seq_output.logprobs.clear() - - logprobs.logprob = float('inf') - logprobs.rank = None - logprobs.decoded_token = None - - 
seq_output.logprobs[next_token_id] = logprobs - - seq_outputs.append(seq_output) - - else: - seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, - (group_sample_logprobs[tdx] - if any_logprobs_are_requested else { - next_token_id: - Logprob(logprob=float('inf'), - rank=None, - decoded_token=None) - }))) - if cache is not None: - completion_seq_group_output.prompt_logprobs = \ - group_prompt_logprobs if any_logprobs_are_requested else None - output.outputs.append(completion_seq_group_output) - else: - output.outputs.append( - CompletionSequenceGroupOutput( - seq_outputs, (group_prompt_logprobs - if any_logprobs_are_requested else None))) - - assert len(output.outputs) > 0 diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py deleted file mode 100644 index 25f588077cb4..000000000000 --- a/vllm/worker/multi_step_neuron_model_runner.py +++ /dev/null @@ -1,84 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from importlib.util import find_spec -from typing import List, Optional - -import torch - -from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors -from vllm.worker.neuron_model_runner import (ModelInputForNeuron, - NeuronModelRunner) - - -class MultiStepNeuronModelRunner(NeuronModelRunner): - """A model runner for multi step decoding using the transformers_neuronx - framework""" - - def __init__( - self, - vllm_config: VllmConfig, - ): - super().__init__(vllm_config) - self.speculation_config = self.speculative_config - from transformers_neuronx.config import GenerationConfig - self.speculation_config.draft_model_config.neuron_sampling_params = ( - GenerationConfig( - max_length=self.scheduler_config.max_model_len, - do_sample=True, - per_batch_line=True, - top_k=[self._MAX_NEURON_SAMPLING_TOP_K] \ - * self.scheduler_config.max_num_seqs, - top_p=[1.0] * self.scheduler_config.max_num_seqs, - temperature=[1.0] * self.scheduler_config.max_num_seqs, - dynamic=True, - global_top_k=self._MAX_NEURON_SAMPLING_TOP_K - )) - - def load_model(self) -> None: - if find_spec("transformers_neuronx") is not None: - from vllm.model_executor.model_loader.neuron import ( - get_neuron_eagle_speculation_model, - get_neuron_speculation_model) - if self.speculation_config.speculative_token_tree is not None: - self.model = get_neuron_eagle_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculation_config) - else: - self.model = get_neuron_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculation_config) - else: - raise NotImplementedError( - "Supports only Transformer-NeuronX based models.") - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNeuron, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - logits = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - - output = self.model.sample( - 
logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - return output diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py deleted file mode 100644 index dd521dd67dad..000000000000 --- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import List, Optional - -import torch - -from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors -from vllm.worker.neuronx_distributed_model_runner import ( - NeuronxDistributedModelRunner) - - -class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner): - """A model runner for multi-step decoding using the - neuronx-distributed-inference framework""" - - def __init__( - self, - vllm_config: VllmConfig, - ): - super().__init__(vllm_config) - - def load_model(self) -> None: - from vllm.model_executor.model_loader.neuronx_distributed import ( - get_neuron_speculation_model) - self.model = get_neuron_speculation_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - speculation_config=self.speculative_config) - - @torch.inference_mode() - def execute_model( - self, - model_input, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - sampling_params = torch.tensor([[ - seq_group.sampling_params.top_k, - seq_group.sampling_params.top_p, - seq_group.sampling_params.temperature, - ] for seq_group in model_input.sampling_metadata.seq_groups]) - - logits = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - sampling_params=sampling_params, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - - output = self.model.sample( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - return output diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py deleted file mode 100644 index ea16e14f9ecd..000000000000 --- a/vllm/worker/multi_step_worker.py +++ /dev/null @@ -1,197 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple - -import torch - -from vllm.distributed import broadcast_tensor_dict, get_pp_group -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.worker.model_runner_base import BroadcastableModelInput -from vllm.worker.multi_step_model_runner import (MultiStepModelRunner, - StatefulModelInput) -from vllm.worker.worker import Worker, WorkerInput - - -@dataclass -class MultiStepState: - worker_input: WorkerInput - model_input: StatefulModelInput - - -class MultiStepWorker(Worker): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - base_model_runner = self.model_runner - # for multi-step model, wrap the model runner with MultiStepModelRunner - self.model_runner = MultiStepModelRunner( - base_model_runner, - vllm_config=base_model_runner.vllm_config, - 
kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=base_model_runner.is_driver_worker, - ) - - pipeline_parallel_size = self.parallel_config.pipeline_parallel_size - self.multi_step_states: List[ - Optional[MultiStepState]] = [None] * pipeline_parallel_size - self.temp_output = None - - def _get_driver_input_and_broadcast( - self, execute_model_req: ExecuteModelRequest - ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: - """ - Get the driver input and broadcast it to other workers. - """ - assert self.is_driver_worker - virtual_engine = execute_model_req.virtual_engine - is_first_multi_step = execute_model_req.is_first_multi_step - if is_first_multi_step: - # on first step we prepare the worker input and model input normally - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: StatefulModelInput = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list, - execute_model_req.virtual_engine, - execute_model_req.finished_requests_ids)) - - if execute_model_req.async_callback: - model_input.frozen_model_input = dataclasses.replace( # type: ignore - model_input.frozen_model_input, - async_callback=execute_model_req.async_callback) - else: - # on subsequent steps we reuse the worker input and model input - multi_step_state = self.multi_step_states[virtual_engine] - worker_input = multi_step_state.worker_input - model_input = multi_step_state.model_input - frozen_model_input = model_input.frozen_model_input - assert frozen_model_input is not None - assert frozen_model_input.attn_metadata is not None - # clear the cached metadata so that it can be recomputed on - # the workers. - frozen_model_input.attn_metadata._cached_prefill_metadata = None - frozen_model_input.attn_metadata._cached_decode_metadata = None - - model_input.is_first_multi_step = is_first_multi_step - model_input.is_last_step = execute_model_req.is_last_step - - if not is_first_multi_step: - # we broadcast the last sampled token ids to all TP workers so they - # can update their model input metadata in-place. - self._prepare_last_sampled_token_ids_for_tp_workers( - execute_model_req=execute_model_req, model_input=model_input) - - if self.do_metadata_broadcast: - broadcast_data = worker_input.as_broadcastable_tensor_dict() - broadcast_data.update(model_input.as_broadcastable_tensor_dict()) - broadcast_tensor_dict(broadcast_data, src=0) - - # Retuning empty dict here to keep this compatible with - # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast` - return model_input, worker_input, {} - - def _prepare_last_sampled_token_ids_for_tp_workers( - self, - execute_model_req: ExecuteModelRequest, - model_input: StatefulModelInput, - ) -> None: - """ - Prepare the last sampled token ids for TP workers. If it's the last - PP rank, then the last sampled token ids are already in the model_input. - If it is NOT the last PP rank, then we need to get the last sampled - token that is cached in the execute_model_req. - """ - if get_pp_group().is_last_rank: - assert model_input.cached_outputs[ - -1].sampler_output.sampled_token_ids is None - assert model_input.cached_outputs[-1].sampled_token_ids is not None - model_input.last_sampled_token_ids = model_input.cached_outputs[ - -1].sampled_token_ids - # free sampled token ids from the previous step if it has been - # pythonized. Cannot free the last sampled token ids because - # we need it for GPU advance_step. 
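The branch above builds worker and model inputs only on the first of N steps and then reuses the cached state for the remaining steps. A toy sketch of that caching pattern, with invented names (`TinyMultiStepWorker`, `CachedStep`) and none of the broadcast or attention-metadata handling, might look like this:

```python
# Prepare inputs once on the first step, then reuse the cached state on
# subsequent steps. Names and fields here are purely illustrative.
from dataclasses import dataclass
from typing import Dict, Optional


@dataclass
class CachedStep:
    worker_input: Dict[str, int]
    model_input: Dict[str, int]


class TinyMultiStepWorker:
    def __init__(self) -> None:
        self._state: Optional[CachedStep] = None

    def prepare(self, request: Dict[str, int], is_first_step: bool) -> CachedStep:
        if is_first_step:
            # Expensive path: build worker and model inputs from the request.
            self._state = CachedStep(
                worker_input={"num_seqs": request["num_seqs"]},
                model_input={"step": 0},
            )
        else:
            # Cheap path: keep the cached inputs, only advance the step counter.
            assert self._state is not None
            self._state.model_input["step"] += 1
        return self._state


worker = TinyMultiStepWorker()
print(worker.prepare({"num_seqs": 2}, is_first_step=True))
print(worker.prepare({"num_seqs": 2}, is_first_step=False))
```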
- for output in model_input.cached_outputs[:-1]: - if output.pythonized: - output.sampled_token_ids = None - else: - # otherwise we need to get the cached sampled token ids from the - # execute_model_req - assert execute_model_req.last_sampled_token_ids is not None - model_input.last_sampled_token_ids = ( - execute_model_req.last_sampled_token_ids.cuda()) - model_input.add_sampler_output( - SamplerOutput(outputs=[], sampled_token_ids=None), - model_input.last_sampled_token_ids) - - # free sampled token ids from the previous step. - # TODO(will) we could reuse the sampled token ids tensor from - # the previous step instead. - for output in model_input.cached_outputs[:-1]: - output.sampled_token_ids = None - assert model_input.cached_outputs[-1].sampled_token_ids is not None - - def prepare_input( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[Tuple[StatefulModelInput, WorkerInput, Dict[str, - torch.Tensor]]]: - """ - Depending on the current state of the request and multi step worker, - this method may skip the normal _prepare_model_input and - _prepare_worker_input methods and instead used cached values. - """ - if self.is_driver_worker: - if execute_model_req is None: - if self.do_metadata_broadcast: - # This signals that there's no more requests to process for - # now. All workers are running infinite loop with - # broadcast_tensor_dict, and it stops the loop when the - # driver broadcasts an empty input. Send an empty input to - # notify all other workers to stop their execution loop. - broadcast_tensor_dict({}, src=0) - return None - - virtual_engine = execute_model_req.virtual_engine - (model_input, worker_input, - kwargs) = self._get_driver_input_and_broadcast(execute_model_req) - assert isinstance(model_input, StatefulModelInput) - if execute_model_req.is_first_multi_step: - # cache the worker input and model input for the next steps - self.multi_step_states[virtual_engine] = MultiStepState( - worker_input=worker_input, model_input=model_input) - # if TP workers - else: - broadcast_data = self._get_worker_input_from_broadcast() - # if the driver has sent an empty input, we should stop the worker - # loop - if broadcast_data is None: - return None - model_input, worker_input, kwargs = broadcast_data - assert isinstance(model_input, StatefulModelInput) - virtual_engine = worker_input.virtual_engine - if model_input.is_first_multi_step: - pass - # TODO(will) Can cache the worker input and model input for the - # next steps. See below for details - else: - # TODO(will) possible to also cache and reuse the cached worker - # input and model input. The idea is essentially the delta - # optimization for model_inputs. 
Where the TP workers can cache - # the model input states and we only broadcast the delta need - # for the next step (sampled_token_ids from the previous step) - - assert isinstance(model_input, StatefulModelInput) - # we need to update the last sampled token ids in the model - # input for the workers so that they can run inplace - # advance_step - model_input.add_sampler_output( - SamplerOutput(outputs=[], sampled_token_ids=None), - model_input.last_sampled_token_ids) - - assert model_input is not None - assert worker_input is not None - return model_input, worker_input, kwargs diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py deleted file mode 100644 index 7ccf1a2c0a87..000000000000 --- a/vllm/worker/neuron_model_runner.py +++ /dev/null @@ -1,460 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import os -from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union - -import torch -from torch import nn - -from vllm.config import DeviceConfig, VllmConfig -from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.neuron import get_neuron_model -from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalKwargs) -from vllm.platforms import current_platform -from vllm.sampling_params import SamplingParams -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.utils import is_pin_memory_available, make_tensor_with_pad -from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase - -if TYPE_CHECKING: - from vllm.attention.backends.abstract import AttentionBackend - -logger = init_logger(__name__) - - -@dataclass(frozen=True) -class ModelInputForNeuron(ModelRunnerInputBase): - """ - Used by the NeuronModelRunner. 
- """ - input_tokens: Optional[torch.Tensor] = None - input_positions: Optional[torch.Tensor] = None - input_block_ids: Optional[torch.Tensor] = None - sampling_metadata: SamplingMetadata = None - multi_modal_kwargs: BatchedTensorInputs = None - adapter_ids: Optional[str] = None - - def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: - return { - "input_tokens": self.input_tokens, - "input_positions": self.input_positions, - "input_block_ids": self.input_block_ids, - "sampling_metadata": self.sampling_metadata, - "multi_modal_kwargs": self.multi_modal_kwargs, - } - - @classmethod - def from_broadcasted_tensor_dict( - cls, - tensor_dict: Dict[str, Any], - attn_backend: Optional["AttentionBackend"] = None, - ) -> "ModelInputForNeuron": - return ModelInputForNeuron( - input_tokens=tensor_dict["input_tokens"], - input_positions=tensor_dict["input_positions"], - input_block_ids=tensor_dict["input_block_ids"], - sampling_metadata=tensor_dict["sampling_metadata"], - multi_modal_kwargs=tensor_dict["multi_modal_kwargs"], - ) - - -class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): - """A model runner for AWS Neuron hardware""" - - # NEURON has an upper limit on the top_k - _MAX_NEURON_SAMPLING_TOP_K = 256 - - def __init__( - self, - vllm_config: VllmConfig, - ): - ModelRunnerBase.__init__(self, vllm_config) - - if (self.model_config is not None - and self.model_config.get_sliding_window()): - logger.warning("Sliding window is not supported on Neuron. " - "The model will run without sliding window.") - self.device_config = (self.device_config if self.device_config - is not None else DeviceConfig()) - self.lora_config = vllm_config.lora_config - self.device = self.device_config.device - self.pin_memory = is_pin_memory_available() - - # Multi-modal data support - self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ - .create_input_mapper(self.model_config) - - # Lazy initialization. - self.model: nn.Module # initialize after load_model. - - # Once NEURON_ON_DEVICE_SAMPLING_DISABLED is set to a non-zero value, - # turn off on-device sampling. - self._on_device_sampling_disabled = int( - os.getenv("NEURON_ON_DEVICE_SAMPLING_DISABLED", "0")) - - # NEURON needs to update sampling parameters when request IDs change - # across batches. This variable stores the previous batch's request IDs - # to determine if an update is needed. - self._previous_batch_request_ids: List[str] = [] - - if not self._on_device_sampling_disabled: - self._init_neuron_sampling() - - def _init_neuron_sampling(self) -> None: - if current_platform.use_transformers_neuronx(): - from transformers_neuronx.config import GenerationConfig - else: - from transformers import GenerationConfig - logger.warning( - "On-device sampling is turned on in Neuron by default, only " - "top_k, top_p, and temperature are current supported sampling " - "parameters. 
To turn off the on-device sampling, please set " - "the environment variable NEURON_ON_DEVICE_SAMPLING_DISABLED=1.") - self.model_config.neuron_sampling_params = GenerationConfig( - max_length=self.scheduler_config.max_model_len, - do_sample=True, - per_batch_line=True, - top_k=[self._MAX_NEURON_SAMPLING_TOP_K] \ - * self.scheduler_config.max_num_seqs, - top_p=[1.0] * self.scheduler_config.max_num_seqs, - temperature=[1.0] * self.scheduler_config.max_num_seqs, - dynamic=True, - global_top_k=self._MAX_NEURON_SAMPLING_TOP_K) - - def load_model(self) -> None: - self.model = get_neuron_model(self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) - - def get_model(self) -> nn.Module: - return self.model - - def _prepare_prompt( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int], - BatchedTensorInputs]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - input_block_ids: List[int] = [] - - seq_lens: List[int] = [] - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - for seq_group_metadata in seq_group_metadata_list: - assert seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - - seq_data = seq_group_metadata.seq_data[seq_id] - prompt_tokens = seq_data.get_token_ids() - seq_len = len(prompt_tokens) - seq_lens.append(seq_len) - - input_tokens.append(prompt_tokens) - input_positions.append(list(range(seq_len))) - - assert seq_group_metadata.block_tables is not None - block_table = seq_group_metadata.block_tables[seq_id] - assert len(block_table) == 1 - input_block_ids.append(block_table[0]) - - mm_kwargs = seq_group_metadata.multi_modal_data - if mm_kwargs: - mm_kwargs = self.process_multi_modal_data_neuron(mm_kwargs) - multi_modal_kwargs_list.append(mm_kwargs) - - max_seq_len = max(seq_lens) - assert max_seq_len > 0 - input_tokens = make_tensor_with_pad(input_tokens, - pad=0, - max_len=max_seq_len, - dtype=torch.long, - device=self.device) - input_positions = make_tensor_with_pad(input_positions, - pad=0, - max_len=max_seq_len, - dtype=torch.long, - device=self.device) - input_block_ids = torch.tensor(input_block_ids, - dtype=torch.long, - device=self.device) - - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - return (input_tokens, input_positions, input_block_ids, seq_lens, - multi_modal_kwargs) - - def _prepare_decode( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[List[int]] = [] - input_positions: List[List[int]] = [] - input_block_ids: List[int] = [] - context_lens: List[int] = [] - - for seq_group_metadata in seq_group_metadata_list: - assert not seq_group_metadata.is_prompt - - seq_ids = list(seq_group_metadata.seq_data.keys()) - - for seq_id in seq_ids: - seq_data = seq_group_metadata.seq_data[seq_id] - generation_token = seq_data.get_last_token_id() - input_tokens.append([generation_token]) - - seq_len = seq_data.get_len() - position = seq_len - 1 - input_positions.append([position]) - context_lens.append(seq_len) - - assert seq_group_metadata.block_tables is not None - block_table = seq_group_metadata.block_tables[seq_id] - assert len(block_table) == 1 - input_block_ids.append(block_table[0]) - - input_tokens = make_tensor_with_pad(input_tokens, - pad=0, - 
max_len=1, - dtype=torch.long, - device=self.device) - input_positions = make_tensor_with_pad(input_positions, - pad=0, - max_len=1, - dtype=torch.long, - device=self.device) - context_lens = torch.tensor(context_lens, - dtype=torch.int, - device=self.device) - input_block_ids = torch.tensor(input_block_ids, - dtype=torch.long, - device=self.device) - - return input_tokens, input_positions, input_block_ids - - def make_model_input_from_broadcasted_tensor_dict( - self, tensor_dict: Dict[str, Any]) -> ModelInputForNeuron: - return ModelInputForNeuron.from_broadcasted_tensor_dict(tensor_dict) - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForNeuron: - multi_modal_kwargs = None - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - # Prepare input tensors. - if is_prompt: - (input_tokens, input_positions, input_block_ids, seq_lens, - multi_modal_kwargs - ) = self._prepare_prompt(seq_group_metadata_list) - else: - (input_tokens, input_positions, - input_block_ids) = self._prepare_decode(seq_group_metadata_list) - seq_lens = None - - if not self._on_device_sampling_disabled: - for seq_group_metadata in seq_group_metadata_list: - sampling_params = seq_group_metadata.sampling_params - top_k, top_p, temperature = ( - self._convert_to_neuron_sampling_params(sampling_params)) - sampling_params.top_k = top_k - sampling_params.top_p = top_p - sampling_params.temperature = temperature - - # we need multi_modal_data for later tokens as well - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - for seq_group_metadata in seq_group_metadata_list: - mm_data = seq_group_metadata.multi_modal_data - if mm_data: - multi_modal_kwargs_list.append(mm_data) - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # query_lens is not needed if chunked prefill is not - # supported. Since neuron worker doesn't support chunked prefill - # just use seq_lens instead. - seq_lens, - self.device, - self.pin_memory, - generators=self.get_generators(finished_requests_ids)) - - if current_platform.use_transformers_neuronx( - ) and not self._on_device_sampling_disabled: - # Once the request IDs are changed in current iteration, we will - # update the on-device sampling parameters. 
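The deleted runner only pushes new on-device sampling parameters when the set of request IDs in the batch changes between iterations. A standalone sketch of that change-detection check follows; `SamplingParamCache` and its method names are illustrative, not vLLM or Neuron APIs.

```python
# Skip the (relatively expensive) on-device update when the batch
# membership is unchanged from the previous iteration.
from typing import List


class SamplingParamCache:
    def __init__(self) -> None:
        self._previous_request_ids: List[str] = []

    def maybe_update(self, request_ids: List[str]) -> bool:
        # Return True only when the batch changed and an update is needed.
        if request_ids != self._previous_request_ids:
            self._previous_request_ids = list(request_ids)
            return True
        return False


cache = SamplingParamCache()
print(cache.maybe_update(["req-1", "req-2"]))  # True: new batch
print(cache.maybe_update(["req-1", "req-2"]))  # False: same batch, skip the update
```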
- current_batch_request_ids = [ - seq_group_meta_data.request_id - for seq_group_meta_data in seq_group_metadata_list - ] - if current_batch_request_ids != self._previous_batch_request_ids: - self._update_neuron_sampling_params(seq_group_metadata_list) - self._previous_batch_request_ids = current_batch_request_ids - - return ModelInputForNeuron(input_tokens=input_tokens, - input_positions=input_positions, - input_block_ids=input_block_ids, - sampling_metadata=sampling_metadata, - multi_modal_kwargs=multi_modal_kwargs) - - def _update_neuron_sampling_params( - self, seq_group_metadata_list: List[SequenceGroupMetadata]): - # Update Neuron sampling parameters (GenerationConfig in Neuron) - current_sampling_params = self.model_config.neuron_sampling_params - assert current_sampling_params is not None, ( - f"Failed to update sampling_params, " - f"current sampling params is {current_sampling_params}") - - is_update_needed = False - - top_k = current_sampling_params.top_k - top_p = current_sampling_params.top_p - temperature = current_sampling_params.temperature - - # The index of a sequence's sampling parameters in neuron is equal to - # its index in `input_block_ids`. - for seq_group_metadata in seq_group_metadata_list: - seq_ids = list(seq_group_metadata.seq_data.keys()) - sampling_params = seq_group_metadata.sampling_params - - seq_group_top_k = sampling_params.top_k - seq_group_top_p = sampling_params.top_p - seq_group_temperature = sampling_params.temperature - - for seq_id in seq_ids: - index = seq_group_metadata.block_tables[seq_id][0] - if (top_k[index] != seq_group_top_k - or top_p[index] != seq_group_top_p - or temperature[index] != seq_group_temperature): - is_update_needed = True - - top_k[index] = seq_group_top_k - top_p[index] = seq_group_top_p - temperature[index] = seq_group_temperature - - # update_generation_config is only available in transformers-neuronx - if is_update_needed and current_platform.use_transformers_neuronx(): - self.model.model.update_generation_config(current_sampling_params) - - def _convert_to_neuron_sampling_params( - self, sampling_params: SamplingParams) -> Tuple[int, float, float]: - # Returns the top_k, top_p and temperature parameters for neuron. 
- top_k = sampling_params.top_k - top_p = sampling_params.top_p - temperature = sampling_params.temperature - - if temperature == 0.0: - # Enable greedy sampling on zero temperature - return (1, 1.0, 1.0) - if top_k < 1 or top_k > self._MAX_NEURON_SAMPLING_TOP_K: - top_k = self._MAX_NEURON_SAMPLING_TOP_K - - return (top_k, top_p, temperature) - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNeuron, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError( - "NeuronModelRunner does not support multi-step execution.") - - # extract top_k, top_p and temperature from model_input for neuron - # forward call - sampling_params = (torch.tensor([[ - seq_group.sampling_params.top_k, seq_group.sampling_params.top_p, - seq_group.sampling_params.temperature - ] for seq_group in model_input.sampling_metadata.seq_groups])) - - if current_platform.use_neuronx_distributed(): - hidden_states = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - sampling_params=sampling_params, - adapter_ids=model_input.adapter_ids, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - elif current_platform.use_transformers_neuronx(): - # [TODO] validate on-device sampling - # The model signature may need change for on-device sampling - hidden_states = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - input_block_ids=model_input.input_block_ids, - **MultiModalKwargs.as_kwargs( - model_input.multi_modal_kwargs or {}, - device=self.device, - ), - ) - - # Compute the logits only if the on-device sampling is turned off as - # on-device sampling outputs the token ids. - if self._on_device_sampling_disabled: - logits = self.model.compute_logits(hidden_states, - model_input.sampling_metadata) - else: - logits = hidden_states - - # Sample the next token. 
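The conversion just above clamps `top_k` to the Neuron upper bound and switches to greedy decoding at zero temperature. A pure-Python restatement of that logic, outside any runner class and with a hard-coded limit standing in for `_MAX_NEURON_SAMPLING_TOP_K`:

```python
# Sketch of the sampling-parameter conversion: temperature 0.0 becomes
# greedy decoding, and out-of-range top_k is clamped to the device limit.
from typing import Tuple

MAX_TOP_K = 256  # mirrors _MAX_NEURON_SAMPLING_TOP_K in the removed code


def to_neuron_sampling_params(top_k: int, top_p: float,
                              temperature: float) -> Tuple[int, float, float]:
    if temperature == 0.0:
        # Zero temperature means greedy decoding: one candidate, no nucleus.
        return 1, 1.0, 1.0
    if top_k < 1 or top_k > MAX_TOP_K:
        # "Unlimited" (-1) or oversized top_k is clamped to the device limit.
        top_k = MAX_TOP_K
    return top_k, top_p, temperature


print(to_neuron_sampling_params(-1, 0.9, 0.7))  # (256, 0.9, 0.7)
print(to_neuron_sampling_params(40, 0.9, 0.0))  # (1, 1.0, 1.0)
```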
- output = self.model.sample( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - return [output] - - @property - def vocab_size(self) -> int: - return self.model_config.get_vocab_size() - - def process_multi_modal_data_neuron(self, mm_data): - # this is a no-op for NeuronModelRunner - return mm_data - - def remove_all_loras(self): - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def add_lora(self, lora_request: LoRARequest): - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") - - def list_loras(self) -> Set[int]: - raise NotImplementedError( - "LoRAs are not supported for Transformers NeuronX framework") diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py deleted file mode 100644 index 4e1408300fb8..000000000000 --- a/vllm/worker/neuron_worker.py +++ /dev/null @@ -1,193 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A Neuron worker class.""" -import os -from typing import List, Optional, Set, Tuple - -import torch.distributed - -from vllm.config import VllmConfig -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment) -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed -from vllm.platforms import current_platform -from vllm.platforms.neuron import NeuronFramework -from vllm.sequence import ExecuteModelRequest -from vllm.worker.neuron_model_runner import NeuronModelRunner -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, - WorkerInput) - -logger = init_logger(__name__) - - -class NeuronWorker(LocalOrDistributedWorkerBase): - """A worker class that executes the model on a group of neuron cores. - """ - - model_runner: NeuronModelRunner - - def __init__(self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool = False) -> None: - WorkerBase.__init__(self, vllm_config=vllm_config) - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - self.lora_config = vllm_config.lora_config - - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() - - neuron_framework = current_platform.get_neuron_framework_to_use() - if neuron_framework == NeuronFramework.TRANSFORMERS_NEURONX: - self.model_runner = self.get_tnx_model_runner(vllm_config) - elif neuron_framework == NeuronFramework.NEURONX_DISTRIBUTED_INFERENCE: - self.model_runner = self.get_neuronx_distributed_model_runner( - vllm_config) - else: - raise NotImplementedError( - "Specified framework" + - f" {os.environ.get('VLLM_NEURON_FRAMEWORK')}" + - " is either not installed or not supported." 
+ - " Supported frameworks: " + - "[transformers-neuronx, neuronx-distributed-inference]") - - def get_tnx_model_runner(self, vllm_config): - assert (self.lora_config - is None), ("LoRA is not supported for TransformersNeuronX " - "framework.") - from vllm.worker.multi_step_neuron_model_runner import ( - MultiStepNeuronModelRunner) - if self.speculative_config is not None: - return MultiStepNeuronModelRunner(vllm_config=vllm_config) - else: - return NeuronModelRunner(vllm_config=vllm_config) - - def get_neuronx_distributed_model_runner(self, vllm_config): - from vllm.worker.multi_step_neuronx_distributed_model_runner import ( - MultiStepNeuronxDistributedModelRunner) - from vllm.worker.neuronx_distributed_model_runner import ( - NeuronxDistributedModelRunner) - if self.speculative_config is not None: - assert (self.lora_config - is None), "LoRA is not supported for Speculative Decoding" - return MultiStepNeuronxDistributedModelRunner( - vllm_config=vllm_config) - else: - return NeuronxDistributedModelRunner(vllm_config=vllm_config) - - def init_device(self) -> None: - self.init_distributed_environment() - - # Set random seed. - set_random_seed(self.model_config.seed) - - def load_model(self): - self.model_runner.load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available KV blocks. - - Swapping is not yet supported, so always return num_cpu_blocks=0. - - We configure num_gpu_blocks to be equal to max_num_seqs. - """ - # Set the number of GPU blocks to be the same as the maximum number of - # sequences that can be processed in a single batch. This is equivalent - # to schedule without PagedAttention. - num_gpu_blocks = self.scheduler_config.max_num_seqs + 1 - - # Swap not yet supported with Neuron backend. - num_cpu_blocks = 0 - - return num_gpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the KV cache. - """ - - # Different values are not tested. - assert num_cpu_blocks == 0 - assert num_gpu_blocks == self.scheduler_config.max_num_seqs + 1 - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - @property - def do_metadata_broadcast(self) -> bool: - return False - - @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - return None - - @torch.inference_mode() - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - return WorkerInput(num_seq_groups=len( - execute_model_req.seq_group_metadata_list), ) - - def execute_worker(self, worker_input: WorkerInput) -> None: - pass - - def get_cache_block_size_bytes(self) -> int: - """Determine the size in bytes of a cache block. - - This is required for speculative decoding; it is not yet implemented. - """ - raise NotImplementedError - - def init_distributed_environment(self): - """Neuron uses transformers-neuronx for tensor parallelism. 
- - vLLM still needs the environment initialized when TP/PP > 1 - """ - init_distributed_environment( - world_size=1, - rank=self.rank, - local_rank=self.local_rank, - distributed_init_method=self.distributed_init_method, - backend=current_platform.dist_backend, - ) - - ensure_model_parallel_initialized( - 1, - 1, - ) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if current_platform.use_transformers_neuronx(): - raise NotImplementedError( - f"{type(self)} does not support LoRA with Neuron Framework " - f"Transformers NeuronX") - return self.model_runner.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if current_platform.use_transformers_neuronx(): - raise NotImplementedError( - f"{type(self)} does not support LoRA with Neuron Framework " - f"Transformers NeuronX") - return self.model_runner.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - if current_platform.use_transformers_neuronx(): - raise NotImplementedError( - f"{type(self)} does not support LoRA with Neuron Framework " - f"Transformers NeuronX") - return self.model_runner.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - if current_platform.use_transformers_neuronx(): - raise NotImplementedError( - f"{type(self)} does not support LoRA with Neuron Framework " - f"Transformers NeuronX") - return self.model_runner.list_loras() diff --git a/vllm/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py deleted file mode 100644 index 2a0f4e77c99e..000000000000 --- a/vllm/worker/neuronx_distributed_model_runner.py +++ /dev/null @@ -1,294 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Set - -import torch -from neuronx_distributed_inference.models.mllama.aspect_ratio_utils import ( - get_all_supported_aspect_ratios) -from neuronx_distributed_inference.modules.generation.sampling import ( - prepare_sampling_params) -from neuronx_distributed_inference.modules.lora_serving import ( - LoraCheckpoint, LoraServingConfig) - -from vllm.config import VllmConfig -from vllm.entrypoints.openai.serving_models import LoRAModulePath -from vllm.logger import init_logger -from vllm.lora.layers import LoRAMapping -from vllm.lora.request import LoRARequest -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.neuronx_distributed import ( - _get_model_architecture, get_neuron_model) -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.worker.neuron_model_runner import (ModelInputForNeuron, - NeuronModelRunner) - -logger = init_logger(__name__) - - -class NeuronxDistributedModelRunner(NeuronModelRunner): - - def __init__( - self, - vllm_config: VllmConfig, - ): - super().__init__(vllm_config) - self.lora_checkpoint = None - self.model = None - self.lora_serving_config = None - - @staticmethod - def _get_lora_paths_strings(lora_modules: List[LoRAModulePath]): - if not lora_modules: - return None - return {_.get("name"): _.get("path") for _ in lora_modules} - - def _get_nxdi_lora_config(self): - override_neuron_config = self.model_config.override_neuron_config - lora_modules = override_neuron_config.pop("lora_modules", None) - target_modules = override_neuron_config.pop("target_modules", None) - lora_ckpt_paths = self._get_lora_paths_strings(lora_modules) - if self.lora_config.max_loras 
< len(lora_ckpt_paths): - raise ValueError( - "Number of LoRAs (%s) exceeds maximum " - "allowed (%s)", len(lora_ckpt_paths), - self.lora_config.max_loras) - - return LoraServingConfig( - max_loras=self.lora_config.max_loras, - max_lora_rank=self.lora_config.max_lora_rank, - target_modules=target_modules, - lora_ckpt_paths=lora_ckpt_paths, - ) - - def load_model(self) -> None: - # Update LoRA config - if self.lora_config is not None: - self.lora_serving_config = self._get_nxdi_lora_config() - self.lora_checkpoint = LoraCheckpoint(self.lora_serving_config) - self.model = get_neuron_model( - self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - lora_serving_config=self.lora_serving_config) - - def get_nxd_sampling_params(self, sampling_metadata): - if self.model.config.neuron_config.on_device_sampling_config: - max_topk = (self.model.config.neuron_config. - on_device_sampling_config.global_topk) - else: - max_topk = self.model.config.vocab_size - - top_k = [1] * self.scheduler_config.max_num_seqs - top_p = [1.0] * self.scheduler_config.max_num_seqs - temperature = [1.0] * self.scheduler_config.max_num_seqs - - for index, sequenceGroupToSample in enumerate( - sampling_metadata.seq_groups): - top_k[index] = (sequenceGroupToSample.sampling_params.top_k - if sequenceGroupToSample.sampling_params.top_k > 0 - else max_topk) - top_p[index] = sequenceGroupToSample.sampling_params.top_p - temperature[index] = ( - sequenceGroupToSample.sampling_params.temperature) - - sampling_params = prepare_sampling_params( - batch_size=self.scheduler_config.max_num_seqs, - top_k=top_k, - top_p=top_p, - temperature=temperature) - return sampling_params - - def get_multi_modal_data_neuron(self, input_images): - raise NotImplementedError("need to restore multi-modal support") - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForNeuron, - kv_caches: Optional[List[torch.Tensor]] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError( - "NeuronModelRunner does not support multi-step execution.") - - if _get_model_architecture( - self.model.config) != "MllamaForConditionalGeneration": - return super().execute_model(model_input, kv_caches, - intermediate_tensors, num_steps) - - sampling_params = self.get_nxd_sampling_params( - model_input.sampling_metadata) - - if model_input.multi_modal_kwargs.get('pixel_values') is not None: - hidden_states = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - seq_ids=model_input.input_block_ids, - pixel_values=model_input.multi_modal_kwargs.get( - 'pixel_values'), - aspect_ratios=model_input.multi_modal_kwargs.get( - 'aspect_ratios'), - sampling_params=sampling_params, - num_chunks=model_input.multi_modal_kwargs.get('num_chunks'), - has_image=model_input.multi_modal_kwargs.get( - 'has_image').squeeze(1), - ) - else: - bs = model_input.input_tokens.shape[0] if (model_input.input_tokens - is not None) else 1 - empty_pixel_values = torch.zeros([bs, 1, 4, 3, 560, 560], - dtype=torch.bfloat16) - empty_aspect_ratios = torch.ones([bs, 1, 2], dtype=torch.int64) - num_chunks = torch.zeros((bs, 1), dtype=torch.int32) - has_image = torch.zeros([bs], dtype=torch.int32) - hidden_states = self.model( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - seq_ids=model_input.input_block_ids, - pixel_values=empty_pixel_values, - 
aspect_ratios=empty_aspect_ratios, - sampling_params=sampling_params, - num_chunks=num_chunks, - has_image=has_image, - ) - - output = self.model.sample( - hidden_states=hidden_states, - sampling_metadata=model_input.sampling_metadata, - ) - - return [output] - - def process_multi_modal_data_neuron(self, mm_data): - # Neuron uses aspect_ratios instead of aspect_ratio_ids - all_supported_aspect_ratios = get_all_supported_aspect_ratios( - self.model.config.vision_config.max_num_tiles) - aspect_ratio_ids = mm_data.get("aspect_ratio_ids") - mm_data["aspect_ratios"] = torch.tensor( - all_supported_aspect_ratios[aspect_ratio_ids]).unsqueeze(0) - - # Neuron's num_chunks is HF's num_tiles - mm_data["num_chunks"] = mm_data.get("num_tiles") - - # Input has an image if it has pixel_values - bs = mm_data["num_chunks"].shape[0] - pixel_values = mm_data.get("pixel_values") - if pixel_values is not None and not torch.all(pixel_values == 0): - mm_data["has_image"] = torch.ones(bs) - - else: - mm_data["has_image"] = torch.zeros(bs) - return mm_data - - def _get_lora_adapter_ids(self, seq_group_metadata_list): - # set LoRA adapter IDs for multi-lora serving - batch_size = len(seq_group_metadata_list) - if self.lora_checkpoint is not None: - # "0" indicates NxDI to use the base model for inference - adapter_ids = ["0"] * batch_size - for idx, seq_group_metadata in enumerate(seq_group_metadata_list): - if seq_group_metadata.lora_request is not None: - adapter_ids[ - idx] = seq_group_metadata.lora_request.lora_name - - # convert adapter_ids from strings to integers - adapter_ids = self.lora_checkpoint.convert_adapter_ids_to_indices( - adapter_ids, batch_size) - else: - adapter_ids = torch.zeros((batch_size), dtype=torch.int32) - - return adapter_ids - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForNeuron: - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - # Prepare input tensors. - if is_prompt: - (input_tokens, input_positions, input_block_ids, seq_lens, - multi_modal_kwargs - ) = self._prepare_prompt(seq_group_metadata_list) - else: - (input_tokens, input_positions, - input_block_ids) = self._prepare_decode(seq_group_metadata_list) - seq_lens = None - - if not self._on_device_sampling_disabled: - for seq_group_metadata in seq_group_metadata_list: - sampling_params = seq_group_metadata.sampling_params - top_k, top_p, temperature = ( - self._convert_to_neuron_sampling_params(sampling_params)) - sampling_params.top_k = top_k - sampling_params.top_p = top_p - sampling_params.temperature = temperature - - # we need multi_modal_data for later tokens as well - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - for seq_group_metadata in seq_group_metadata_list: - mm_data = seq_group_metadata.multi_modal_data - if mm_data: - multi_modal_kwargs_list.append(mm_data) - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - lora_adapter_ids = self._get_lora_adapter_ids(seq_group_metadata_list) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - # query_lens is not needed if chunked prefill is not - # supported. Since neuron worker doesn't support chunked prefill - # just use seq_lens instead. 
- seq_lens, - self.device, - self.pin_memory, - generators=self.get_generators(finished_requests_ids)) - - return ModelInputForNeuron(input_tokens=input_tokens, - input_positions=input_positions, - input_block_ids=input_block_ids, - sampling_metadata=sampling_metadata, - multi_modal_kwargs=multi_modal_kwargs, - adapter_ids=lora_adapter_ids) - - def remove_all_loras(self): - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") - - def add_lora(self, lora_request: LoRARequest): - logger.warning( - "Adding LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config. If you supplied " - "the parameter, you can ignore this warning. Ignoring" - "lora request: ", lora_request) - - def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") - - def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") - - def list_loras(self) -> Set[int]: - raise NotImplementedError( - "Managing LoRAs is only supported through the " - "lora_modules parameter in override_neuron_config") diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py deleted file mode 100644 index e49783ad9b24..000000000000 --- a/vllm/worker/pooling_model_runner.py +++ /dev/null @@ -1,214 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast - -import torch - -from vllm.config import VllmConfig -from vllm.distributed import get_pp_group -from vllm.forward_context import set_forward_context -from vllm.logger import init_logger -from vllm.model_executor.models.interfaces_base import VllmModelForPooling -from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.multimodal import MultiModalKwargs -from vllm.pooling_params import PoolingParams -from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, - SequenceGroupMetadata) -from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPU, - ModelInputForGPUBuilder) - -logger = init_logger(__name__) - - -@dataclasses.dataclass(frozen=True) -class ModelInputForGPUWithPoolingMetadata(ModelInputForGPU): - """ - Used by the PoolingModelRunner. 
- """ - pooling_metadata: Optional["PoolingMetadata"] = None - - -class PoolingModelRunner( - GPUModelRunnerBase[ModelInputForGPUWithPoolingMetadata]): - _model_input_cls: Type[ModelInputForGPUWithPoolingMetadata] = ( - ModelInputForGPUWithPoolingMetadata) - _builder_cls: Type[ModelInputForGPUBuilder] = ModelInputForGPUBuilder - - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - ): - super().__init__(vllm_config=vllm_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker) - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelInputForGPUWithPoolingMetadata, - kv_caches: List[torch.Tensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: - if num_steps > 1: - raise ValueError( - "PoolingModelRunner does not support multi-step execution.") - - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - # Currently cuda graph is only supported by the decode phase. - assert model_input.attn_metadata is not None - prefill_meta = model_input.attn_metadata.prefill_metadata - decode_meta = model_input.attn_metadata.decode_metadata - virtual_engine = model_input.virtual_engine - # Pooling models are (ab-)used also to integrate non text models that - # are not autoregressive (PrithviGeosaptialMAE). - # These model might not use attention and do not really have a prefill - # and decode phase. The model input is processed in one shot and both - # decode_metadata and prefill_metadata would be None for such models. - # See the PlaceholderAttentionMetadata class. - # TODO: Figure out if cuda_graph is of any use for these models and - # explore how to leverage it. 
- if (prefill_meta is None and decode_meta is not None - and decode_meta.use_cuda_graph): - if model_input.inputs_embeds is None: - assert model_input.input_tokens is not None - graph_batch_size = model_input.input_tokens.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, False)]) - else: - graph_batch_size = model_input.inputs_embeds.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, True)]) - else: - model_executable = self.model - - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - seqlen_agnostic_kwargs = { - "finished_requests_ids": model_input.finished_requests_ids, - "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, - } if self.has_inner_state else {} - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_start = torch.cuda.Event(enable_timing=True) - model_forward_end = torch.cuda.Event(enable_timing=True) - model_forward_start.record() - - cross_enc_kwargs = {} - if model_input.token_types is not None: - cross_enc_kwargs["token_type_ids"] = model_input.token_types - - with set_forward_context(model_input.attn_metadata, self.vllm_config, - virtual_engine): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs( - multi_modal_kwargs, - device=self.device, - ), - **cross_enc_kwargs, - **seqlen_agnostic_kwargs, - ) - - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.record() - - # Only perform pooling in the last pipeline stage. - if not get_pp_group().is_last_rank: - if (self.is_driver_worker - and hidden_or_intermediate_states is not None - and isinstance(hidden_or_intermediate_states, - IntermediateTensors) - and self.observability_config is not None - and self.observability_config.collect_model_forward_time): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - hidden_or_intermediate_states.tensors["model_forward_time"] = ( - torch.tensor(model_forward_time + orig_model_forward_time)) - return hidden_or_intermediate_states - - # Only perform pooling in the driver worker. - if not self.is_driver_worker: - return [] - - return [ - self.model.pooler(hidden_states=hidden_or_intermediate_states, - pooling_metadata=model_input.pooling_metadata) - ] - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, - Any]) -> ModelInputForGPUWithPoolingMetadata: - return ModelInputForGPUWithPoolingMetadata.from_broadcasted_tensor_dict( - tensor_dict, - attn_backend=self.attn_backend, - ) - - def prepare_model_input( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForGPUWithPoolingMetadata: - assert seq_group_metadata_list is not None - model_input = self._prepare_model_input_tensors( - seq_group_metadata_list, finished_requests_ids) - # Prepare PoolingMetadata. 
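The deleted `_prepare_pooling` gathers per-group pooling parameters together with prompt lengths so the pooler can slice hidden states per prompt. A small self-contained sketch of that metadata assembly, using a stand-in `TinyPoolingMetadata` rather than vLLM's `PoolingMetadata`:

```python
# Each sequence group contributes (seq_ids, pooling params); prompt lengths
# travel alongside so pooling can be applied per prompt.
from dataclasses import dataclass
from typing import Dict, List, Tuple


@dataclass
class TinyPoolingMetadata:
    seq_groups: List[Tuple[List[int], dict]]
    prompt_lens: List[int]


def build_pooling_metadata(groups: List[Dict]) -> TinyPoolingMetadata:
    seq_groups = [(g["seq_ids"], g["pooling_params"]) for g in groups]
    prompt_lens = [g["prompt_len"] for g in groups]
    return TinyPoolingMetadata(seq_groups=seq_groups, prompt_lens=prompt_lens)


meta = build_pooling_metadata([
    {"seq_ids": [0], "pooling_params": {"task": "embed"}, "prompt_len": 5},
    {"seq_ids": [1], "pooling_params": {"task": "embed"}, "prompt_len": 9},
])
print(meta)
```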
- assert model_input.seq_lens is not None - pooling_metadata = self._prepare_pooling(seq_group_metadata_list, - model_input.seq_lens) - - return dataclasses.replace(model_input, - pooling_metadata=pooling_metadata) - - def _prepare_pooling( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - prompt_lens: List[int], - ) -> PoolingMetadata: - """Prepare PoolingMetadata for the sequence group metadata list.""" - seq_groups: List[Tuple[List[int], PoolingParams]] = [] - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - seq_ids = list(seq_group_metadata.seq_data.keys()) - - pooling_params = seq_group_metadata.pooling_params - assert pooling_params is not None - assert (task := pooling_params.task) is not None, ( - "You did not set `task` in the API") - - model = cast(VllmModelForPooling, self.model) - to_update = model.pooler.get_pooling_updates(task) - to_update.apply(pooling_params) - - seq_groups.append((seq_ids, pooling_params)) - - seq_data: Dict[int, SequenceData] = {} - for seq_group_metadata in seq_group_metadata_list: - seq_data.update(seq_group_metadata.seq_data) - - pooling_metadata = PoolingMetadata( - seq_groups=seq_groups, - seq_data=seq_data, - prompt_lens=prompt_lens, - ) - - return pooling_metadata diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py deleted file mode 100644 index 512a1dca7370..000000000000 --- a/vllm/worker/utils.py +++ /dev/null @@ -1,49 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -''' -Worker-related helper functions. -''' - -from vllm.utils import STR_NOT_IMPL_ENC_DEC_ERR_STRS -from vllm.worker.model_runner import GPUModelRunnerBase - - -def assert_enc_dec_mr_supported_scenario( - enc_dec_mr: GPUModelRunnerBase) -> None: - ''' - Asserted that the provided encoder/decoder model runner instance reflects - a supported scenario. 
- ''' - - # Reminder: Please update docs/features/compatibility_matrix.md - # If the feature combo become valid - - if enc_dec_mr.cache_config.enable_prefix_caching: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE']) - - if enc_dec_mr.sliding_window is not None: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SWA']) - - if enc_dec_mr.scheduler_config.chunked_prefill_enabled: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_ERR_STRS[ - 'STR_NOT_IMPL_ENC_DEC_CHUNKED_PREFILL']) - - if getattr(enc_dec_mr.model_config.hf_config, 'attn_logit_softcapping', - None) is not None: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP'] - ) - - if enc_dec_mr.lora_config is not None: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_LORA']) - - if enc_dec_mr.parallel_config.pipeline_parallel_size > 1: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PP']) - - if enc_dec_mr.scheduler_config.num_lookahead_slots > 0: - raise NotImplementedError( - STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_SPEC_DEC']) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py deleted file mode 100644 index 9dfea947568d..000000000000 --- a/vllm/worker/worker.py +++ /dev/null @@ -1,587 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""A GPU worker class.""" -import gc -import os -from typing import Dict, List, Optional, Set, Tuple, Type, Union - -import torch -import torch.distributed - -import vllm.envs as envs -from vllm.attention.layer import Attention -from vllm.config import VllmConfig, get_layers_from_vllm_config -from vllm.device_allocator.cumem import CuMemAllocator -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment, - set_custom_all_reduce) -from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig -from vllm.platforms import current_platform -from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, - SequenceGroupMetadata, SequenceGroupMetadataDelta) -from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache, - memory_profiling) -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner -from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner -from vllm.worker.pooling_model_runner import PoolingModelRunner -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, - WorkerInput) - -logger = init_logger(__name__) - - -class Worker(LocalOrDistributedWorkerBase): - """A worker class that executes (a partition of) the model on a GPU. - - Each worker is associated with a single GPU. The worker is responsible for - maintaining the KV cache and executing the model on the GPU. In case of - distributed inference, each worker is assigned a partition of the model. 
- """ - - def __init__( - self, - vllm_config: VllmConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - is_driver_worker: bool = False, - model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None, - ) -> None: - WorkerBase.__init__(self, vllm_config) - self.parallel_config.rank = rank - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.is_driver_worker = is_driver_worker - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() - - # Return hidden states from target model if the draft model is an - # mlp_speculator - speculative_config = self.speculative_config - model_config = self.model_config - speculative_args = {} if speculative_config is None \ - or (speculative_config.draft_model_config.hf_config.model_type == - model_config.hf_config.model_type) \ - or (speculative_config.draft_model_config.hf_config.model_type - not in ("medusa", - "mlp_speculator", - "eagle", - "deepseek_mtp", - "glm4_moe_mtp", - "mimo_mtp")) \ - else {"return_hidden_states": True} - - ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner - if model_config.runner_type == "pooling": - ModelRunnerClass = PoolingModelRunner - elif self.model_config.is_encoder_decoder: - ModelRunnerClass = EncoderDecoderModelRunner - self.model_runner: GPUModelRunnerBase = ModelRunnerClass( - vllm_config=self.vllm_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker, - **speculative_args, - ) - if model_runner_cls is not None: - self.model_runner = model_runner_cls(self.model_runner) - - # Uninitialized cache engine. Will be initialized by - # initialize_cache. - self.cache_engine: List[CacheEngine] - # Initialize gpu_cache as pooling models don't initialize kv_caches - self.gpu_cache: Optional[List[List[torch.Tensor]]] = None - self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} - - # Buffers saved before sleep - self._sleep_saved_buffers: Dict[str, torch.Tensor] = {} - - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace - if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR - logger.info("Profiling enabled. 
Traces will be saved to: %s", - torch_profiler_trace_dir) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - with_stack=True, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, use_gzip=True)) - else: - self.profiler = None - - def start_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.start() - - def stop_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.stop() - print( - self.profiler.key_averages().table(sort_by="self_cuda_time_total")) - - def sleep(self, level: int = 1) -> None: - free_bytes_before_sleep = torch.cuda.mem_get_info()[0] - - # Save the buffers before level 2 sleep - if level == 2: - model = self.model_runner.model - self._sleep_saved_buffers = { - name: buffer.cpu().clone() - for name, buffer in model.named_buffers() - } - - allocator = CuMemAllocator.get_instance() - allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) - free_bytes_after_sleep, total = torch.cuda.mem_get_info() - freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep - used_bytes = total - free_bytes_after_sleep - assert freed_bytes >= 0, "Memory usage increased after sleeping." - logger.info( - "Sleep mode freed %.2f GiB memory, " - "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes, - used_bytes / GiB_bytes) - - def wake_up(self, tags: Optional[list[str]] = None) -> None: - allocator = CuMemAllocator.get_instance() - allocator.wake_up(tags=tags) - - # Restore the buffers after level 2 sleep - if len(self._sleep_saved_buffers): - model = self.model_runner.model - for name, buffer in model.named_buffers(): - if name in self._sleep_saved_buffers: - buffer.data.copy_(self._sleep_saved_buffers[name].data) - self._sleep_saved_buffers = {} - - def init_device(self) -> None: - if self.device_config.device.type == "cuda": - # torch.distributed.all_reduce does not free the input tensor until - # the synchronization point. This causes the memory usage to grow - # as the number of all_reduce calls increases. This env var disables - # this behavior. - # Related issue: - # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573 - os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" - - # This env var set by Ray causes exceptions with graph building. - os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) - self.device = torch.device(f"cuda:{self.local_rank}") - torch.cuda.set_device(self.device) - - _check_if_gpu_supports_dtype(self.model_config.dtype) - gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - self.baseline_snapshot = MemorySnapshot() - else: - raise RuntimeError( - f"Not support device type: {self.device_config.device}") - # Initialize the distributed environment. - init_worker_distributed_environment(self.vllm_config, self.rank, - self.distributed_init_method, - self.local_rank) - # Set random seed. 
- set_random_seed(self.model_config.seed) - - def load_model(self): - if self.vllm_config.model_config.enable_sleep_mode: - allocator = CuMemAllocator.get_instance() - assert allocator.get_current_usage() == 0, ( - "Sleep mode can only be " - "used for one instance per process.") - context = allocator.use_memory_pool(tag="weights") - else: - from contextlib import nullcontext - context = nullcontext() - with context: - self.model_runner.load_model() - - def save_sharded_state( - self, - path: str, - pattern: Optional[str] = None, - max_size: Optional[int] = None, - ) -> None: - self.model_runner.save_sharded_state( - path, - pattern=pattern, - max_size=max_size, - ) - - def save_tensorized_model( - self, - tensorizer_config: TensorizerConfig, - ) -> None: - self.model_runner.save_tensorized_model( - tensorizer_config=tensorizer_config, ) - - @torch.inference_mode() - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Profiles the peak memory usage of the model to determine how many - KV blocks may be allocated without OOMs. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - - Tip: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Profile the memory usage of the model and get the maximum number of - # cache blocks that can be allocated with the remaining free memory. - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - - free_memory_pre_profile, total_gpu_memory = torch.cuda.mem_get_info() - - # Execute a forward pass with dummy inputs to profile the memory usage - # of the model. - with memory_profiling( - self.baseline_snapshot, - weights_memory=self.model_runner.model_memory_usage) as result: - self.model_runner.profile_run() - - self._assert_memory_footprint_increased_during_profiling() - - memory_for_current_instance = total_gpu_memory * \ - self.cache_config.gpu_memory_utilization - available_kv_cache_memory = (memory_for_current_instance - - result.non_kv_cache_memory) - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. 
- cache_block_size = self.get_cache_block_size_bytes() - if cache_block_size == 0: - num_gpu_blocks = 0 - num_cpu_blocks = 0 - else: - num_gpu_blocks = int(available_kv_cache_memory // cache_block_size) - num_cpu_blocks = int(self.cache_config.swap_space_bytes // - cache_block_size) - num_gpu_blocks = max(num_gpu_blocks, 0) - num_cpu_blocks = max(num_cpu_blocks, 0) - - msg = (f"Memory profiling takes {result.profile_time:.2f} seconds\n" - "the current vLLM instance can use " - "total_gpu_memory " - f"({(total_gpu_memory / GiB_bytes):.2f}GiB)" - " x gpu_memory_utilization " - f"({self.cache_config.gpu_memory_utilization:.2f})" - f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n" - "model weights take " - f"{(result.weights_memory / GiB_bytes):.2f}GiB;" - " non_torch_memory takes " - f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;" - " PyTorch activation peak memory takes " - f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;" - " the rest of the memory reserved for KV Cache is " - f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB.") - - logger.info(msg) - # Final cleanup - gc.collect() - - return num_gpu_blocks, num_cpu_blocks - - def _assert_memory_footprint_increased_during_profiling(self): - # NOTE(woosuk): Here we assume that the other processes using the same - # GPU did not change their memory usage during the profiling. - free_gpu_memory, total = torch.cuda.mem_get_info() - cuda_memory = total - free_gpu_memory - assert self.baseline_snapshot.cuda_memory < cuda_memory, ( - "Error in memory profiling. " - f"Initial used memory {self.baseline_snapshot.cuda_memory}, " - f"currently used memory {cuda_memory}. " - f"This happens when the GPU memory was " - "not properly cleaned up before initializing the vLLM instance.") - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Allocate GPU and CPU KV cache with the specified number of blocks. - - This also warms up the model, which may record CUDA graphs. - """ - raise_if_cache_size_invalid( - num_gpu_blocks, self.cache_config.block_size, - self.cache_config.is_attention_free, - self.model_config.max_model_len, - self.parallel_config.pipeline_parallel_size) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - if self.vllm_config.model_config.enable_sleep_mode: - allocator = CuMemAllocator.get_instance() - context = allocator.use_memory_pool(tag="kv_cache") - else: - from contextlib import nullcontext - context = nullcontext() - with context: - self._init_cache_engine() - self._warm_up_model() - - def _init_cache_engine(self): - assert self.cache_config.num_gpu_blocks is not None - self.cache_engine = [ - CacheEngine(self.cache_config, self.model_config, - self.parallel_config, self.device_config) - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - self.gpu_cache = [ - self.cache_engine[ve].gpu_cache - for ve in range(self.parallel_config.pipeline_parallel_size) - ] - - # Layer pairings for cross-layer KV sharing. - # If an Attention layer `layer_name` is in the keys of this dict, it - # means this layer will perform attention using the keys and values - # from the KV cache of `shared_kv_cache_layers[layer_name]`. 
- shared_kv_cache_layers: dict[str, str] = {} - - attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention) - - for layer_name, attn_module in attn_layers.items(): - if (kv_tgt_layer := - attn_module.kv_sharing_target_layer_name) is not None: - # The layer doesn't need its own KV cache and will use that of - # the target layer. We skip creating a KVCacheSpec for it, so - # that KV cache management logic will act as this layer does - # not exist, and doesn't allocate KV cache for the layer. This - # enables the memory saving of cross-layer kv sharing, allowing - # a given amount of memory to accommodate longer context lengths - # or enable more requests to be processed simultaneously. - shared_kv_cache_layers[layer_name] = kv_tgt_layer - - bind_kv_cache(self.compilation_config.static_forward_context, - self.gpu_cache, shared_kv_cache_layers) - - def _warm_up_model(self) -> None: - # warm up sizes that are not in cudagraph capture sizes, - # but users still want to compile for better performance, - # e.g. for the max-num-batched token size in chunked prefill. - warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy() - if not self.model_config.enforce_eager: - warmup_sizes = [ - x for x in warmup_sizes if x not in - self.vllm_config.compilation_config.cudagraph_capture_sizes - ] - for size in sorted(warmup_sizes, reverse=True): - logger.info("Compile and warming up model for size %d", size) - self.model_runner._dummy_run(size) - if not self.model_config.enforce_eager: - self.model_runner.capture_model(self.gpu_cache) - # Reset the seed to ensure that the random state is not affected by - # the model initialization and profiling. - set_random_seed(self.model_config.seed) - - @property - def do_metadata_broadcast(self) -> bool: - return self.parallel_config.tensor_parallel_size > 1 - - @property - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - return self.gpu_cache - - @torch.inference_mode() - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - virtual_engine = execute_model_req.virtual_engine - num_steps = execute_model_req.num_steps - num_seq_groups = len(execute_model_req.seq_group_metadata_list) - # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. - # they contain parameters to launch cudamemcpyasync. - blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, - device="cpu", - dtype=torch.int64).view(-1, 2) - blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, - device="cpu", - dtype=torch.int64).view(-1, 2) - # `blocks_to_copy` is a gpu tensor. The src and tgt of - # blocks to copy are in the same device, and `blocks_to_copy` - # can be used directly within cuda kernels. - blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device=self.device, - dtype=torch.int64).view(-1, 2) - - return WorkerInput( - num_seq_groups=num_seq_groups, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - virtual_engine=virtual_engine, - num_steps=num_steps, - ) - - @torch.inference_mode() - def execute_worker(self, worker_input: WorkerInput) -> None: - virtual_engine = worker_input.virtual_engine - # Issue cache operations. 
- if (worker_input.blocks_to_swap_in is not None - and worker_input.blocks_to_swap_in.numel() > 0): - self.cache_engine[virtual_engine].swap_in( - worker_input.blocks_to_swap_in) - if (worker_input.blocks_to_swap_out is not None - and worker_input.blocks_to_swap_out.numel() > 0): - self.cache_engine[virtual_engine].swap_out( - worker_input.blocks_to_swap_out) - if (worker_input.blocks_to_copy is not None - and worker_input.blocks_to_copy.numel() > 0): - self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) - - def _get_cached_seq_group_metadata( - self, - seq_group_metadata_list: List[Union[SequenceGroupMetadata, - SequenceGroupMetadataDelta]], - finished_request_ids: List[str]) -> List[SequenceGroupMetadata]: - """Return a list of cached Sequence Group Metadata after updating its - state. - - It is used because scheduler only sends delta to workers to reduce - the data payload size. The function also cleans up cache based on - a given `finished_request_ids`. - """ - new_seq_group_metadata_list = [] - for metadata_or_delta in seq_group_metadata_list: - request_id = metadata_or_delta.request_id - if request_id not in self._seq_group_metadata_cache: - # The first prefill. - assert isinstance(metadata_or_delta, SequenceGroupMetadata) - self._seq_group_metadata_cache[request_id] = metadata_or_delta - else: - # The first prefill is already cached. - if isinstance(metadata_or_delta, SequenceGroupMetadataDelta): - self._seq_group_metadata_cache[request_id].apply_delta( - metadata_or_delta) - else: - # If metadata snapshot is sent again, it is - # preempted. Reset the cache because we need to start - # from scratch. - assert isinstance(metadata_or_delta, SequenceGroupMetadata) - self._seq_group_metadata_cache[ - request_id] = metadata_or_delta - - new_seq_group_metadata_list.append( - self._seq_group_metadata_cache[request_id]) - - # Clean up finished ids - for finished_id in finished_request_ids: - del self._seq_group_metadata_cache[finished_id] - - return new_seq_group_metadata_list - - def _execute_model_spmd( - self, - execute_model_req: ExecuteModelRequest, - intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> Optional[List[SamplerOutput]]: - if execute_model_req is not None: - new_seq_group_metadata_list = self._get_cached_seq_group_metadata( - execute_model_req.seq_group_metadata_list, - execute_model_req.finished_requests_ids) - - execute_model_req.seq_group_metadata_list = ( - new_seq_group_metadata_list) - output = super()._execute_model_spmd(execute_model_req, - intermediate_tensors) - return output - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.model_runner.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.model_runner.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - return self.model_runner.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.model_runner.list_loras() - - @property - def max_model_len(self) -> int: - return self.model_config.max_model_len - - @property - def vocab_size(self) -> int: - return self.model_runner.vocab_size - - def get_cache_block_size_bytes(self) -> int: - """Get the size of the KV cache block size in bytes. 
- """ - return CacheEngine.get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) - - -def init_worker_distributed_environment( - vllm_config: VllmConfig, - rank: int, - distributed_init_method: Optional[str] = None, - local_rank: int = -1, -) -> None: - """Initialize the distributed environment.""" - parallel_config = vllm_config.parallel_config - set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) - - init_distributed_environment(parallel_config.world_size, rank, - distributed_init_method, local_rank, - current_platform.dist_backend) - ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) - - ensure_kv_transfer_initialized(vllm_config) - - -def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): - # Check if the GPU supports the dtype. - if torch_dtype == torch.bfloat16: # noqa: SIM102 - if not current_platform.has_device_capability(80): - capability = current_platform.get_device_capability() - gpu_name = current_platform.get_device_name() - - if capability is None: - compute_str = "does not have a compute capability" - else: - version_str = capability.as_version_str() - compute_str = f"has compute capability {version_str}" - - raise ValueError( - "Bfloat16 is only supported on GPUs with compute capability " - f"of at least 8.0. Your {gpu_name} GPU {compute_str}. " - "You can use float16 instead by explicitly setting the " - "`dtype` flag in CLI, for example: --dtype=half.") - - -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free, - max_model_len, pipeline_parallel_size) -> None: - if is_attention_free and num_gpu_blocks != 0: - raise ValueError("No memory should be allocated for the cache blocks " - f"for an attention-free model, but {num_gpu_blocks} " - "blocks are allocated.") - if not is_attention_free and num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = block_size * (num_gpu_blocks // pipeline_parallel_size) - if not is_attention_free and max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). 
Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py deleted file mode 100644 index f1c9a0ab001e..000000000000 --- a/vllm/worker/worker_base.py +++ /dev/null @@ -1,643 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import dataclasses -import os -import time -from abc import abstractmethod -from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union - -import cloudpickle -import torch -import torch.nn as nn - -from vllm.config import (ObservabilityConfig, VllmConfig, - set_current_vllm_config) -from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group -from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.utils import (enable_trace_function_call_for_thread, - resolve_obj_by_qualname, run_method, - update_environment_variables, - warn_for_unimplemented_methods) -from vllm.worker.model_runner_base import (BroadcastableModelInput, - ModelRunnerBase, - ModelRunnerInputBase) - -logger = init_logger(__name__) - - -@warn_for_unimplemented_methods -class WorkerBase: - """Worker interface that allows vLLM to cleanly separate implementations for - different hardware. Also abstracts control plane communication, e.g., to - communicate request metadata to other workers. - """ - - def __init__( - self, - vllm_config: VllmConfig, - ) -> None: - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - self.cache_config = vllm_config.cache_config - self.lora_config = vllm_config.lora_config - self.load_config = vllm_config.load_config - self.parallel_config = vllm_config.parallel_config - self.scheduler_config = vllm_config.scheduler_config - self.device_config = vllm_config.device_config - self.speculative_config = vllm_config.speculative_config - self.observability_config = vllm_config.observability_config - self.kv_transfer_config = vllm_config.kv_transfer_config - self.compilation_config = vllm_config.compilation_config - from vllm.platforms import current_platform - self.current_platform = current_platform - - def init_device(self) -> None: - """Initialize device state, such as loading the model or other on-device - memory allocations. - """ - raise NotImplementedError - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the KV cache with the given size in blocks. - """ - raise NotImplementedError - - def get_model(self) -> nn.Module: - raise NotImplementedError - - def load_model(self) -> None: - """Load model onto target device.""" - raise NotImplementedError - - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[List[SamplerOutput]]: - raise NotImplementedError - - def start_worker_execution_loop(self) -> None: - """Execute model loop in parallel worker. - - You can stop the loop by executing a driver worker with an empty output. - See `stop_remote_worker_execution_loop` for more details. 
- """ - with self.current_platform.inference_mode(): - while True: - output = self.execute_model(execute_model_req=None) - if output is None: - return None - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available blocks for the GPU KV cache and - swappable CPU KV cache. - - The implementation may run profiling or other heuristics to determine - the size of caches. - - Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks - are blocks that are "active" on the device and can be appended to. - num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be - appended to. - """ - raise NotImplementedError - - def get_cache_block_size_bytes(self) -> int: - """Return the size of a single cache block, in bytes. Used in - speculative decoding. - """ - raise NotImplementedError - - def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError - - def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError - - def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError - - def list_loras(self) -> Set[int]: - raise NotImplementedError - - @property - def vocab_size(self) -> int: - """Get vocabulary size from model configuration.""" - return self.model_config.get_vocab_size() - - -class DelegateWorkerBase(WorkerBase): - """ - A class that delegates all methods to another WorkerBase instance. This is - useful for creating a WorkerBase that wraps another WorkerBase instance, - e.g. speculative decoding. - """ - worker: WorkerBase - - def __init__( - self, - *args, - **kwargs, - ) -> None: - vllm_config: VllmConfig = kwargs.get("vllm_config") - cls = resolve_obj_by_qualname(vllm_config.parallel_config.worker_cls) - self.worker = cls(*args, **kwargs) - - def init_device(self) -> None: - self.worker.init_device() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - return self.worker.determine_num_available_blocks() - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - - def load_model(self) -> None: - """Load model onto target device.""" - self.worker.load_model() - - def get_model(self) -> nn.Module: - return self.worker.get_model() - - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[List[SamplerOutput]]: - return self.worker.execute_model(execute_model_req) - - def get_cache_block_size_bytes(self) -> int: - return self.worker.get_cache_block_size_bytes() - - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.worker.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - return self.worker.remove_lora(lora_id) - - def pin_lora(self, lora_id: int) -> bool: - return self.worker.pin_lora(lora_id) - - def list_loras(self) -> Set[int]: - return self.worker.list_loras() - - def __getattr__(self, attr): - return getattr(self.worker, attr) - - -class LoRANotSupportedWorkerBase(WorkerBase): - """Partial implementation of WorkerBase that raises exceptions when LoRA - methods are invoked. 
- """ - - def add_lora(self, lora_request: LoRARequest) -> bool: - raise ValueError(f"{type(self)} does not support LoRA") - - def remove_lora(self, lora_id: int) -> bool: - raise ValueError(f"{type(self)} does not support LoRA") - - def pin_lora(self, lora_id: int) -> bool: - raise ValueError(f"{type(self)} does not support LoRA") - - def list_loras(self) -> Set[int]: - raise ValueError(f"{type(self)} does not support LoRA") - - -@dataclasses.dataclass(frozen=True) -class WorkerInput: - """Local inputs to each worker. May contain device-specific data. These - fields should be broadcastable to other workers. - """ - - num_seq_groups: Optional[int] = None - blocks_to_swap_in: Optional[torch.Tensor] = None - blocks_to_swap_out: Optional[torch.Tensor] = None - blocks_to_copy: Optional[torch.Tensor] = None - virtual_engine: int = 0 - num_steps: int = 1 - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type["WorkerInput"], - tensor_dict: Dict[str, Any], - ) -> "WorkerInput": - """ - Pop fields from the given tensor_dict and populate a new instance of - WorkerInput. - """ - return cls( - num_seq_groups=tensor_dict.pop("num_seq_groups"), - blocks_to_swap_in=tensor_dict.pop("blocks_to_swap_in"), - blocks_to_swap_out=tensor_dict.pop("blocks_to_swap_out"), - blocks_to_copy=tensor_dict.pop("blocks_to_copy"), - virtual_engine=tensor_dict["virtual_engine"], - num_steps=tensor_dict.pop("num_steps"), - ) - - def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: - """ - Extract broadcastable fields. - """ - tensor_dict = { - "num_seq_groups": self.num_seq_groups, - "blocks_to_swap_in": self.blocks_to_swap_in, - "blocks_to_swap_out": self.blocks_to_swap_out, - "blocks_to_copy": self.blocks_to_copy, - "virtual_engine": self.virtual_engine, - "num_steps": self.num_steps, - } - - return tensor_dict - - -class LocalOrDistributedWorkerBase(WorkerBase): - """ - Partial implementation of WorkerBase that has a default `execute_model` - definition to perform metadata transfer between workers when in distributed - mode. Subclasses of this interface should use model runners that inherit - from ModelRunnerBase, and should only need to implement worker-local logic. - If custom control plane logic is needed to transfer metadata, or if the - model runner cannot inherit from ModelRunnerBase, use WorkerBase instead. - """ - is_driver_worker: bool - model_runner: ModelRunnerBase - observability_config: Optional[ObservabilityConfig] = None - - @property - @abstractmethod - def do_metadata_broadcast(self) -> bool: - """ - Used by the default `execute_model` to check whether broadcast is - needed to transfer request inputs from the driver worker to other - workers in the TP group. If WorkerBase subclass only supports - single-worker execution, then this method should return False. - """ - raise NotImplementedError - - @property - @abstractmethod - def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - """ - Gets the list of kv caches to pass to the worker's model runner. Each - element in the list is a kv cache corresponding to a particular virtual - engine (PP stream). Used by the default `execute_model`. If the worker's - model runner does not follow the ModelRunnerBase interface, then inherit - from WorkerBase instead. - """ - raise NotImplementedError - - @abstractmethod - def prepare_worker_input( - self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - """ - Prepare the inputs to WorkerBase.execute_worker from an execution - request. 
This method may move data to the worker's local device. It is - not allowed to communicate with other workers or devices. - """ - raise NotImplementedError - - @abstractmethod - def execute_worker(self, worker_input: WorkerInput) -> None: - """ - Process an execution request. - """ - raise NotImplementedError - - def _get_worker_input_from_broadcast( - self - ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ - str, torch.Tensor]]]: - """ Get the worker input from the broadcasted tensor dict. """ - assert self.do_metadata_broadcast - assert not self.is_driver_worker - broadcast_data = broadcast_tensor_dict(src=0) - if not broadcast_data: - return None - - worker_input = WorkerInput.from_broadcasted_tensor_dict(broadcast_data) - model_input = ( - self.model_runner.make_model_input_from_broadcasted_tensor_dict( - broadcast_data)) - - kwargs = extract_previous_hidden_states(broadcast_data) - - return model_input, worker_input, kwargs - - def _get_driver_input_and_broadcast( - self, execute_model_req: ExecuteModelRequest - ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: - """ Get the driver input and broadcast it to other workers. """ - assert self.is_driver_worker - - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: ModelRunnerInputBase = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list, - execute_model_req.virtual_engine, - execute_model_req.finished_requests_ids)) - - kwargs = extract_previous_hidden_states(execute_model_req) - - if self.do_metadata_broadcast: - broadcast_data = worker_input.as_broadcastable_tensor_dict() - broadcast_data.update(model_input.as_broadcastable_tensor_dict()) - broadcast_data.update(kwargs) - broadcast_tensor_dict(broadcast_data, src=0) - - if execute_model_req.async_callback: - model_input = dataclasses.replace( # type: ignore - model_input, - async_callback=execute_model_req.async_callback) - - return model_input, worker_input, kwargs - - def prepare_input( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ - str, torch.Tensor]]]: - """ - Prepare the inputs to ModelRunner and workers. - """ - if self.is_driver_worker: - if execute_model_req is None: - if self.do_metadata_broadcast: - # This signals that there's no more requests to process for - # now. All workers are running infinite loop with - # broadcast_tensor_dict, and it stops the loop when the - # driver broadcasts an empty input. Send an empty input to - # notify all other workers to stop their execution loop. - broadcast_tensor_dict({}, src=0) - return None - return self._get_driver_input_and_broadcast(execute_model_req) - else: - return self._get_worker_input_from_broadcast() - - def get_model(self) -> nn.Module: - return self.model_runner.get_model() - - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None, - ) -> Optional[List[SamplerOutput]]: - """Executes at least one model step on the given sequences, unless no - sequences are provided.""" - start_time = time.perf_counter() - - inputs = self.prepare_input(execute_model_req) - if inputs is None: - return None - - model_input, worker_input, kwargs = inputs - num_steps = worker_input.num_steps - - self.execute_worker(worker_input) - - # If there is no input, we don't need to execute the model. 
- if worker_input.num_seq_groups == 0: - return [] - - intermediate_tensors = None - orig_model_execute_time = 0.0 - if not get_pp_group().is_first_rank: - intermediate_tensors = IntermediateTensors( - get_pp_group().recv_tensor_dict( - all_gather_group=get_tp_group())) - if (self.observability_config is not None - and self.observability_config.collect_model_execute_time): - orig_model_execute_time = intermediate_tensors.tensors.get( - "model_execute_time", torch.tensor(0)).item() - - output = self.model_runner.execute_model( - model_input=model_input, - kv_caches=self.kv_cache[worker_input.virtual_engine] - if self.kv_cache is not None else None, - intermediate_tensors=intermediate_tensors, - num_steps=num_steps, - **kwargs, - ) - - model_execute_time = time.perf_counter() - start_time - if not get_pp_group().is_last_rank: - # output is IntermediateTensors - assert isinstance(output, IntermediateTensors) - if (self.observability_config is not None - and self.observability_config.collect_model_execute_time): - output.tensors["model_execute_time"] = torch.tensor( - model_execute_time + orig_model_execute_time) - get_pp_group().send_tensor_dict(output.tensors, - all_gather_group=get_tp_group()) - return [None] - if (self.observability_config is not None - and self.observability_config.collect_model_execute_time - and output is not None): - for o in output: - o.model_execute_time = (orig_model_execute_time + - model_execute_time) - - # output is List[SamplerOutput] - return output - - def _execute_model_spmd( - self, - execute_model_req: ExecuteModelRequest, - intermediate_tensors: Optional[IntermediateTensors] = None - ) -> Optional[List[SamplerOutput]]: - """ - Execute model in Single Program Multiple Data (SPMD) fashion. - All workers take the same request, prepare the input and - execute the model. - """ - assert execute_model_req is not None, ( - "_execute_model_spmd() requires each worker to take in an " - "ExecuteModelRequest") - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: ModelRunnerInputBase = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list)) - - self.execute_worker(worker_input) - - # If there is no input, we don't need to execute the model. - if worker_input.num_seq_groups == 0: - return [] - - kwargs = extract_previous_hidden_states(execute_model_req) - - return self.model_runner.execute_model( - model_input=model_input, - kv_caches=self.kv_cache[worker_input.virtual_engine] - if self.kv_cache is not None else None, - intermediate_tensors=intermediate_tensors, - **kwargs, - ) - - -class WorkerWrapperBase: - """ - This class represents one process in an executor/engine. It is responsible - for lazily initializing the worker and handling the worker's lifecycle. - We first instantiate the WorkerWrapper, which remembers the worker module - and class name. Then, when we call `update_environment_variables`, and the - real initialization happens in `init_worker`. - """ - - def __init__( - self, - vllm_config: VllmConfig, - rpc_rank: int = 0, - ) -> None: - """ - Initialize the worker wrapper with the given vllm_config and rpc_rank. - Note: rpc_rank is the rank of the worker in the executor. In most cases, - it is also the rank of the worker in the distributed group. However, - when multiple executors work together, they can be different. - e.g. in the case of SPMD-style offline inference with TP=2, - users can launch 2 engines/executors, each with only 1 worker. 
- All workers have rpc_rank=0, but they have different ranks in the TP - group. - """ - self.rpc_rank = rpc_rank - self.worker: Optional[WorkerBase] = None - self.vllm_config: Optional[VllmConfig] = None - # do not store this `vllm_config`, `init_worker` will set the final - # one. TODO: investigate if we can remove this field in - # `WorkerWrapperBase`, `init_cached_hf_modules` should be - # unnecessary now. - if vllm_config.model_config is not None: - # it can be None in tests - trust_remote_code = vllm_config.model_config.trust_remote_code - if trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() - - def adjust_rank(self, rank_mapping: Dict[int, int]) -> None: - """ - Adjust the rpc_rank based on the given mapping. - It is only used during the initialization of the executor, - to adjust the rpc_rank of workers after we create all workers. - """ - if self.rpc_rank in rank_mapping: - self.rpc_rank = rank_mapping[self.rpc_rank] - - def update_environment_variables(self, envs_list: List[Dict[str, - str]]) -> None: - envs = envs_list[self.rpc_rank] - key = 'CUDA_VISIBLE_DEVICES' - if key in envs and key in os.environ: - # overwriting CUDA_VISIBLE_DEVICES is desired behavior - # suppress the warning in `update_environment_variables` - del os.environ[key] - update_environment_variables(envs) - - def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: - """ - Here we inject some common logic before initializing the worker. - Arguments are passed to the worker class constructor. - """ - kwargs = all_kwargs[self.rpc_rank] - self.vllm_config = kwargs.get("vllm_config", None) - assert self.vllm_config is not None, ( - "vllm_config is required to initialize the worker") - enable_trace_function_call_for_thread(self.vllm_config) - - from vllm.plugins import load_general_plugins - load_general_plugins() - - if isinstance(self.vllm_config.parallel_config.worker_cls, str): - worker_class = resolve_obj_by_qualname( - self.vllm_config.parallel_config.worker_cls) - else: - logger.warning( - "passing worker_cls as a class object is strongly deprecated," - " as the serialization of class objects can be tricky and" - " error-prone. To be safe, please keep the class in a separate" - " module and pass the qualified name of the class as a string." 
- ) - assert isinstance(self.vllm_config.parallel_config.worker_cls, - bytes) - worker_class = cloudpickle.loads( - self.vllm_config.parallel_config.worker_cls) - if self.vllm_config.parallel_config.worker_extension_cls: - worker_extension_cls = resolve_obj_by_qualname( - self.vllm_config.parallel_config.worker_extension_cls) - extended_calls = [] - if worker_extension_cls not in worker_class.__bases__: - # check any conflicts between worker and worker_extension_cls - for attr in dir(worker_extension_cls): - if attr.startswith("__"): - continue - assert not hasattr(worker_class, attr), ( - f"Worker class {worker_class} already has an attribute" - f" {attr}, which conflicts with the worker" - f" extension class {worker_extension_cls}.") - if callable(getattr(worker_extension_cls, attr)): - extended_calls.append(attr) - # dynamically inherit the worker extension class - worker_class.__bases__ = worker_class.__bases__ + ( - worker_extension_cls, ) - logger.info( - "Injected %s into %s for extended collective_rpc calls %s", - worker_extension_cls, worker_class, extended_calls) - with set_current_vllm_config(self.vllm_config): - # To make vLLM config available during worker initialization - self.worker = worker_class(**kwargs) - assert self.worker is not None - - def initialize_from_config(self, kv_cache_configs: List[Any]) -> None: - kv_cache_config = kv_cache_configs[self.rpc_rank] - with set_current_vllm_config(self.vllm_config): - self.worker.initialize_from_config(kv_cache_config) # type: ignore - - def init_device(self): - with set_current_vllm_config(self.vllm_config): - # To make vLLM config available during device initialization - self.worker.init_device() # type: ignore - - def execute_method(self, method: Union[str, bytes], *args, **kwargs): - try: - # method resolution order: - # if a method is defined in this class, it will be called directly. - # otherwise, since we define `__getattr__` and redirect attribute - # query to `self.worker`, the method will be called on the worker. - return run_method(self, method, args, kwargs) - except Exception as e: - # if the driver worker also execute methods, - # exceptions in the rest worker may cause deadlock in rpc like ray - # see https://github.com/vllm-project/vllm/issues/3455 - # print the error and inform the user to solve the error - msg = (f"Error executing method {method!r}. " - "This might cause deadlock in distributed execution.") - logger.exception(msg) - raise e - - def __getattr__(self, attr): - return getattr(self.worker, attr) - - -def extract_previous_hidden_states( - data: Union[ExecuteModelRequest, Dict[str, torch.Tensor]]) -> \ - Dict[str, torch.Tensor]: - """If data contains previous_hidden_states, extract it. This returns a dict - which can be used directly as additional kwargs in any following - execute_model calls. This is used in draft models like EAGLE.""" - output = {} - - # When called from non-driver worker, data is dict but when called from - # driver worker, data is ExecuteModelRequest. - if isinstance(data, dict): - if "previous_hidden_states" in data: - output["previous_hidden_states"] = data["previous_hidden_states"] - elif data.previous_hidden_states is not None: - output["previous_hidden_states"] = data.previous_hidden_states\ - .hidden_states - - return output
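
For readers tracing what this removal takes away, the following is a minimal, standalone sketch of the control-plane hand-off that the deleted LocalOrDistributedWorkerBase.execute_model relied on: the driver packs WorkerInput fields and the model-runner input into one flat dict, broadcasts it, and non-driver workers rebuild their inputs from that dict, with an empty dict signalling the execution loop to stop. The FakeWorkerInput class, the driver_side/worker_side helpers, and the in-process dict hand-off are illustrative stand-ins only; the real code used vllm.distributed.broadcast_tensor_dict with torch tensors and the full set of swap/copy fields shown above.

# Standalone sketch of the driver -> worker metadata round-trip used by the
# removed LocalOrDistributedWorkerBase. Names prefixed with "Fake" are
# illustrative stand-ins, not vLLM APIs.
import dataclasses
from typing import Any, Dict, Optional


@dataclasses.dataclass(frozen=True)
class FakeWorkerInput:
    """Mirrors the shape of the deleted WorkerInput dataclass."""
    num_seq_groups: Optional[int] = None
    virtual_engine: int = 0
    num_steps: int = 1

    @classmethod
    def from_broadcasted_tensor_dict(
            cls, tensor_dict: Dict[str, Any]) -> "FakeWorkerInput":
        # Pop the worker-side fields; whatever remains belongs to the
        # model runner's own input (as in the removed implementation,
        # virtual_engine is read but intentionally left in the dict).
        return cls(
            num_seq_groups=tensor_dict.pop("num_seq_groups"),
            virtual_engine=tensor_dict["virtual_engine"],
            num_steps=tensor_dict.pop("num_steps"),
        )

    def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
        return {
            "num_seq_groups": self.num_seq_groups,
            "virtual_engine": self.virtual_engine,
            "num_steps": self.num_steps,
        }


def driver_side(worker_input: FakeWorkerInput,
                model_input: Dict[str, Any]) -> Dict[str, Any]:
    # The driver flattens worker input and model input into one dict; the
    # removed code then sent it with broadcast_tensor_dict(..., src=0).
    broadcast_data = worker_input.as_broadcastable_tensor_dict()
    broadcast_data.update(model_input)
    return broadcast_data


def worker_side(broadcast_data: Dict[str, Any]):
    # An empty dict is the shutdown signal for the worker execution loop.
    if not broadcast_data:
        return None
    worker_input = FakeWorkerInput.from_broadcasted_tensor_dict(broadcast_data)
    # After popping the worker fields, the rest is the model-runner input.
    model_input = broadcast_data
    return worker_input, model_input


if __name__ == "__main__":
    payload = driver_side(
        FakeWorkerInput(num_seq_groups=2, virtual_engine=0, num_steps=1),
        {"input_tokens": [1, 2, 3]},
    )
    print(worker_side(dict(payload)))  # reconstructed worker + model input
    print(worker_side({}))             # None: stop signal for the loop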