Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions tests/v1/worker/test_scheduler_tkv_limits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""
Tests for scheduler TKV limit enforcement.

This module tests the chunked prefill scheduler's ability to correctly
enforce the max_batch_tkv_limit constraint.
"""

import pytest
from scheduling_utils import create_request_for_scheduler_test, random_prompt
from v1.worker.mock_model import InstrumentedModelRunner
from spyre_util import REFERENCE_MODELS
from vllm_spyre.platform import SpyrePlatform


@pytest.mark.cpu
@pytest.mark.chunked_prefill
def test_scheduler_tkv_limits(monkeypatch: pytest.MonkeyPatch):
"""
Test that the scheduler correctly enforces the TKV limit constraint.

The test creates 5 requests with varying prompt lengths and a large
max_tokens value, then runs the scheduler to completion. With the bug
present, the scheduler incorrectly accepts a batch configuration that
violates the TKV limit of 131072.

Expected behavior (when bug is fixed):
- Scheduler should reject requests that would cause TKV limit violations
Comment thread
Daniel-Schenker marked this conversation as resolved.
Outdated
- Test should pass without exceeding hardware constraints

Current behavior (with bug):
- Scheduler accepts invalid batch configurations
- Test will fail with assertion errors
"""
# Setup: Use the default test model
model = REFERENCE_MODELS[InstrumentedModelRunner.DEFAULT_TEST_MODEL]

# Build model runner with specific constraints
model_runner = InstrumentedModelRunner.build(
monkeypatch=monkeypatch,
max_num_batched_tokens=512,
max_num_seqs=32,
max_model_len=32768,
available_blocks=32768,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see this having an impact here, but maybe set this to 8192 as this is the max number of blocks we currently support? 32K blocks (of 64 tokens) seems like a looot.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we did actually have available_blocks=8192 at first... But there was an error stating that 16385 blocks were needed. Maybe @joerunde can comment further on this.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1, though we ended up with some startup errors because the max batch tkv limit wasn't being respected, which ended up crashing because we calculated a required 16k blocks 😢

We should probably understand why the env patch wasn't getting picked up, but that also doesn't need t block getting this in

)

# Configure the TKV limit
scheduler = model_runner.scheduler
scheduler.max_batch_tkv_limit = 131072
SpyrePlatform._max_batch_tkv_limit = 131072
monkeypatch.setenv("VLLM_DT_MAX_BATCH_TKV_LIMIT", "131072")

# Define prompt lengths for each request
# This combination is designed to trigger the TKV limit bug
prompt_lengths = [940, 412, 969, 949, 11946]
max_tokens = 16384

# Create and add all requests to the scheduler
requests = []
for request_id, prompt_length in enumerate(prompt_lengths):
prompt = random_prompt(model=model, seed=0, length=prompt_length)
request = create_request_for_scheduler_test(
model=model,
request_id=request_id,
add_step=0,
max_tokens=max_tokens,
prompt=prompt,
use_golden_token_injection=False,
generate_hf_results=False,
).request
requests.append(request)
scheduler.add_request(request)

# Run the scheduler loop until all requests complete
# With the bug present, the scheduler will incorrectly accept a batch
# configuration that exceeds the TKV limit
while True:
sched_output = scheduler.schedule()
output = model_runner.execute_model(sched_output)
scheduler.update_from_output(sched_output, output)
if len(scheduler.running) == 0:
break
8 changes: 4 additions & 4 deletions vllm_spyre/v1/core/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,13 +505,13 @@ def check_batch_tkv_limit_cp(

# Compute token lengths for all running requests (decode batch)
decode_req_max_tkvs = []
# Decide new tkv based on max of current tkv or new request prompt tokens
dec_req_tkv = max(self.tkv, request.num_prompt_tokens)
for req in running:
# current tkv of the (left aligned) decode sequence
dec_req_tkv = (
n_blocks + 1
) * self.block_size + req.num_computed_tokens % self.block_size
n_generated_output_tokens = req.num_computed_tokens - req.num_prompt_tokens
dec_req_max_tkv = dec_req_tkv + (req.max_tokens - n_generated_output_tokens) - 1
dec_req_max_tkv += self.block_size # Account for potential padding block
# Do we need to round to next block size here?
Comment thread
Daniel-Schenker marked this conversation as resolved.
Outdated
decode_req_max_tkvs.append(dec_req_max_tkv)

# Sort decode requests token lengths in ascending order
Expand Down
Loading