
Commit aac2fb8

southfreebird and Sergei Skvortsov authored; njhill committed
[V1] Logit processors for rejection sampler (vllm-project#19482)
Signed-off-by: southfreebird <[email protected]>
Signed-off-by: Sergei Skvortsov <[email protected]>
Signed-off-by: Sergei Skvortsov <[email protected]>
Co-authored-by: Sergei Skvortsov <[email protected]>
Co-authored-by: Nick Hill <[email protected]>
1 parent 34abdd4 commit aac2fb8
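
This commit threads the V1 sampler's built-in logits processors (penalties, bad words, allowed token ids) through the rejection sampler used for speculative decoding, and makes engine initialization reject user-supplied custom logitsprocs when speculative decoding is enabled. A minimal sketch of the newly guarded configuration (the model name and processor FQCN are placeholders; the kwargs mirror the test below):

import pytest

from vllm import LLM

# Combining a custom logits processor with speculative decoding should now
# fail fast at LLM construction time, before the processor is ever loaded.
with pytest.raises(ValueError):
    LLM(
        model="facebook/opt-125m",  # placeholder model
        speculative_config={"model": "ngram", "num_speculative_tokens": 1},
        logits_processors=["my_pkg.MyLogitsProcessor"],  # hypothetical FQCN
    )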

File tree

12 files changed: +468 −89 lines

tests/v1/logits_processors/test_custom_offline.py

Lines changed: 38 additions & 21 deletions
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import random
 import sys
-from typing import Union
+from typing import Any, Union
 
 import pytest
 
@@ -25,6 +25,7 @@
 from vllm import LLM, SamplingParams
 from vllm.v1.sample.logits_processor import (
     STR_POOLING_REJECTS_LOGITSPROCS,
+    STR_SPEC_DEC_REJECTS_LOGITSPROCS,
     LogitsProcessor,
 )
 
@@ -205,6 +206,7 @@ def test_custom_logitsprocs_req(monkeypatch):
 
 
 @create_new_process_for_each_test()
+@pytest.mark.parametrize("model_scenario", ["pooling", "spec_dec"])
 @pytest.mark.parametrize(
     "logitproc_source",
     [
@@ -213,11 +215,12 @@ def test_custom_logitsprocs_req(monkeypatch):
         CustomLogitprocSource.LOGITPROC_SOURCE_CLASS,
     ],
 )
-def test_pooling_rejects_custom_logitsprocs(
-    monkeypatch, logitproc_source: CustomLogitprocSource
+def test_rejects_custom_logitsprocs(
+    monkeypatch, model_scenario: str, logitproc_source: CustomLogitprocSource
 ):
     """Validate that vLLM engine initialization properly rejects custom
-    logitsprocs when the model is a pooling model.
+    logitsprocs when the model is a pooling model or speculative decoding
+    is enabled.
 
     Use `LLM` entrypoint. We expect `LLM` initialization to fail before the
     logitproc is actually loaded.
@@ -241,8 +244,32 @@ def test_pooling_rejects_custom_logitsprocs(
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
     random.seed(40)
 
+    test_params: dict[str, dict[str, Any]] = {
+        "pooling": {
+            "runner": "pooling",
+            "model": POOLING_MODEL_NAME,
+            "error_message": STR_POOLING_REJECTS_LOGITSPROCS,
+            "speculative_config": None,
+        },
+        "spec_dec": {
+            "runner": "auto",
+            "model": MODEL_NAME,
+            "error_message": STR_SPEC_DEC_REJECTS_LOGITSPROCS,
+            "speculative_config": {"model": "ngram", "num_speculative_tokens": 1},
+        },
+    }
+
+    config = test_params[model_scenario]
+
+    llm_kwargs: dict[str, Any] = {
+        "runner": config["runner"],
+        "model": config["model"],
+        "gpu_memory_utilization": 0.1,
+        "speculative_config": config["speculative_config"],
+    }
+
     if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT:
-        # Scenario: vLLM loads a pooling model and ignores a logitproc that is
+        # Scenario: vLLM loads a model and ignores a logitproc that is
         # available at a preconfigured entrypoint
 
         # Patch in dummy logitproc entrypoint
@@ -254,30 +281,20 @@ def test_pooling_rejects_custom_logitsprocs(
         # although they should ignore the entrypoint patch anyway
         monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
 
-        llm = LLM(
-            runner="pooling",
-            model=POOLING_MODEL_NAME,
-            gpu_memory_utilization=0.1,
-        )
+        llm = LLM(**llm_kwargs)
         # Require that no logitsprocs have been loaded
         worker = llm.llm_engine.model_executor.driver_worker.worker
         assert sum([1 for _ in worker.model_runner.input_batch.logitsprocs.all]) == 0
         return
 
-    kwargs: dict[str, list[Union[str, type[LogitsProcessor]]]] = {}
     if logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_FQCN:
         # Scenario: load logitproc based on fully-qualified class name (FQCN)
-        kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN]
+        llm_kwargs["logits_processors"] = [DUMMY_LOGITPROC_FQCN]
     elif logitproc_source == CustomLogitprocSource.LOGITPROC_SOURCE_CLASS:
         # Scenario: load logitproc from provided class object
-        kwargs["logits_processors"] = [DummyLogitsProcessor]
+        llm_kwargs["logits_processors"] = [DummyLogitsProcessor]
 
-    with pytest.raises(ValueError, match=STR_POOLING_REJECTS_LOGITSPROCS):
-        # Require that loading a pooling model alongside the logitproc raises
+    with pytest.raises(ValueError, match=config["error_message"]):
+        # Require that loading a model alongside the logitproc raises
         # the appropriate exception.
-        LLM(
-            runner="pooling",
-            model=POOLING_MODEL_NAME,
-            gpu_memory_utilization=0.1,
-            **kwargs,
-        )
+        LLM(**llm_kwargs)
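
Both scenarios funnel into the same assertion: the "pooling" row expects STR_POOLING_REJECTS_LOGITSPROCS, while the new "spec_dec" row builds an ngram speculative config and expects STR_SPEC_DEC_REJECTS_LOGITSPROCS. A condensed sketch of the shared pattern (the expect_rejection helper is hypothetical; the test inlines it):

import pytest

from vllm import LLM

def expect_rejection(llm_kwargs: dict, error_message: str) -> None:
    # LLM construction must raise the scenario-specific error before any
    # logits processor is loaded.
    with pytest.raises(ValueError, match=error_message):
        LLM(**llm_kwargs)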

tests/v1/sample/test_rejection_sampler.py

Lines changed: 171 additions & 9 deletions
@@ -6,6 +6,7 @@
 import torch
 import torch.nn.functional as F
 
+from tests.v1.sample.utils import create_allowed_token_ids
 from vllm.platforms import current_platform
 from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -21,7 +22,9 @@ def rejection_sampler():
 
 
 def create_logits_tensor(
-    output_token_ids: list[list[int]], vocab_size: int = 100
+    output_token_ids: list[list[int]],
+    vocab_size: int = 100,
+    token_idx_to_override: Optional[int] = None,
 ) -> torch.Tensor:
     """Helper function to create logits tensor that
     will produce desired token ids on argmax"""
@@ -33,15 +36,25 @@ def create_logits_tensor(
         for j, token_id in enumerate(tokens):
             logits[start_loc + j, token_id] = 100.0
         start_loc += len(tokens)
+    if token_idx_to_override is not None:
+        logits[:, token_idx_to_override] = 99.0
     return logits
 
 
 def create_sampling_metadata(
     all_greedy: bool,
+    output_token_ids: Optional[list[list[int]]] = None,
+    prompt_token_ids: Optional[torch.Tensor] = None,
+    spec_token_ids: Optional[list[list[int]]] = None,
     temperature: Optional[torch.Tensor] = None,
     top_k: Optional[torch.Tensor] = None,
     top_p: Optional[torch.Tensor] = None,
     generators: Optional[dict[int, Any]] = None,
+    frequency_penalties: Optional[list[float]] = None,
+    presence_penalties: Optional[list[float]] = None,
+    repetition_penalties: Optional[list[float]] = None,
+    bad_words_token_ids: Optional[dict[int, list[list[int]]]] = None,
+    allowed_token_ids_mask: Optional[torch.Tensor] = None,
 ) -> SamplingMetadata:
     """Create a v1 sampling metadata object with all_greedy set
     to the given value. Either all greedy or all random sampling
@@ -53,6 +66,21 @@ def create_sampling_metadata(
     else:
         assert temperature is not None
 
+    if any([frequency_penalties, presence_penalties, repetition_penalties]):
+        no_penalties = False
+
+        assert output_token_ids
+        assert len(output_token_ids) > 0
+
+        frequency_penalties = torch.tensor(frequency_penalties, device=DEVICE)
+        presence_penalties = torch.tensor(presence_penalties, device=DEVICE)
+        repetition_penalties = torch.tensor(repetition_penalties, device=DEVICE)
+    else:
+        no_penalties = True
+        frequency_penalties = torch.tensor([])
+        presence_penalties = torch.tensor([])
+        repetition_penalties = torch.tensor([])
+
     return SamplingMetadata(
         temperature=temperature,
         all_greedy=all_greedy,
@@ -61,14 +89,15 @@ def create_sampling_metadata(
         top_k=top_k,
         generators=generators,
         max_num_logprobs=0,
-        no_penalties=False,
-        prompt_token_ids=None,
-        frequency_penalties=torch.tensor([]),
-        presence_penalties=torch.tensor([]),
-        repetition_penalties=torch.tensor([]),
-        output_token_ids=[],
-        allowed_token_ids_mask=None,
-        bad_words_token_ids={},
+        no_penalties=no_penalties,
+        prompt_token_ids=prompt_token_ids,
+        frequency_penalties=frequency_penalties,
+        presence_penalties=presence_penalties,
+        repetition_penalties=repetition_penalties,
+        output_token_ids=[] if output_token_ids is None else output_token_ids,
+        spec_token_ids=[] if spec_token_ids is None else spec_token_ids,
+        allowed_token_ids_mask=allowed_token_ids_mask,
+        bad_words_token_ids={} if bad_words_token_ids is None else bad_words_token_ids,
         logitsprocs=LogitsProcessors(),
     )
 
@@ -611,3 +640,136 @@ def test_top_p(rejection_sampler, top_p):
         unmasked_indices=top_p_indices,
         sampling_metadata=sampling_metadata,
     )
+
+
+########################### Tests for Logit Processors ###################
+def test_frequency_penalties(rejection_sampler):
+    """Test rejection sampling with frequency penalties"""
+    spec_tokens = [[1, 1, 1], [], [1, 1, 1]]
+    output_tokens = [[1, 1, 1, 1], [7], [1, 1, 1, 1]]  # last tokens are the bonus tokens
+
+    num_requests = len(spec_tokens)
+    logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
+    metadata = create_sampling_metadata(
+        all_greedy=True,
+        output_token_ids=[[2], [3], [4]],
+        spec_token_ids=spec_tokens,
+        prompt_token_ids=torch.tensor([[5, 6, 7], [6, 7, 8], [7, 8, 9]], device=DEVICE),
+        frequency_penalties=[1.5, 1.5, 0.7],
+        presence_penalties=[0.0] * num_requests,
+        repetition_penalties=[1.0] * num_requests,
+    )
+    bonus_token_tensor = torch.tensor(
+        [output_tokens[i][-1] for i in range(len(output_tokens))], device=logits.device
+    )
+    spec_decode_metadata = SpecDecodeMetadata.make_dummy(
+        spec_tokens, device=logits.device
+    )
+    output = rejection_sampler(
+        spec_decode_metadata,
+        draft_probs=None,
+        target_logits=logits,
+        bonus_token_ids=bonus_token_tensor,
+        sampling_metadata=metadata,
+    )
+    expected = torch.tensor(
+        [[1, 15, -1, -1], [7, -1, -1, -1], [1, 1, 15, -1]],
+        dtype=torch.int,
+        device=logits.device,
+    )
+    assert torch.equal(output, expected)
+
+
+def test_bad_words(rejection_sampler):
+    """Test rejection sampling with bad words constraints"""
+    spec_tokens = [[1, 2, 3], [1, 15, 3], [1, 2, 3]]
+    output_tokens = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
+
+    logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
+    metadata = create_sampling_metadata(
+        all_greedy=True,
+        output_token_ids=[[2], [3], [4]],
+        spec_token_ids=spec_tokens,
+        bad_words_token_ids={
+            0: [
+                [
+                    2,
+                ]
+            ],
+            1: [
+                [
+                    2,
+                ]
+            ],
+            # Do not apply bad words to the last request
+        },
+    )
+    bonus_token_tensor = torch.tensor(
+        [output_tokens[i][-1] for i in range(len(output_tokens))], device=logits.device
+    )
+    spec_decode_metadata = SpecDecodeMetadata.make_dummy(
+        spec_tokens, device=logits.device
+    )
+    output = rejection_sampler(
+        spec_decode_metadata,
+        draft_probs=None,
+        target_logits=logits,
+        bonus_token_ids=bonus_token_tensor,
+        sampling_metadata=metadata,
+    )
+
+    expected = torch.tensor(
+        [[1, 15, -1, -1], [1, 15, 3, 4], [1, 2, 3, 4]],
+        dtype=torch.int,
+        device=logits.device,
+    )
+    assert torch.equal(output, expected)
+
+
+def test_allowed_token_ids(rejection_sampler):
+    """Test rejection sampling with allowed token ids"""
+    spec_tokens = [[1, 2, 10], [10, 5, 3], [7, 10, 12]]
+    output_tokens = [[1, 2, 10, 5], [10, 5, 10, 5], [7, 10, 12, 5]]
+    # Disallowed tokens per request (the helper leaves odd-indexed
+    # request 1 unmasked):
+    # request 0: tokens 0-4
+    # request 2: tokens 2-6
+    num_allowed_token_ids = 5
+
+    # Token 15 is what the sampler picks when a draft token is rejected
+    logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
+
+    batch_size = len(output_tokens)
+    _, vocab_size = logits.size()
+    mask = create_allowed_token_ids(
+        batch_size=batch_size,
+        vocab_size=vocab_size,
+        num_allowed_token_ids=num_allowed_token_ids,
+        device=logits.device,
+    )
+    metadata = create_sampling_metadata(
+        all_greedy=True,
+        output_token_ids=[[], [], []],
+        spec_token_ids=spec_tokens,
+        allowed_token_ids_mask=mask,
+    )
+    bonus_token_tensor = torch.tensor(
+        [output_tokens[i][-1] for i in range(len(output_tokens))], device=logits.device
+    )
+    spec_decode_metadata = SpecDecodeMetadata.make_dummy(
+        spec_tokens, device=logits.device
+    )
+    output = rejection_sampler(
+        spec_decode_metadata,
+        draft_probs=None,
+        target_logits=logits,
+        bonus_token_ids=bonus_token_tensor,
+        sampling_metadata=metadata,
+    )
+
+    expected = torch.tensor(
+        [[15, -1, -1, -1], [10, 5, 10, -1], [7, 10, 12, 5]],
+        dtype=torch.int,
+        device=logits.device,
+    )
+    assert torch.equal(output, expected)
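
The expected tensors follow from the helper's logit layout: every target token gets logit 100.0 and the override token 15 gets 99.0, so under greedy verification a draft token survives only while its processed logit stays above 99.0. A hand-check of test_frequency_penalties, assuming OpenAI-style frequency penalties (logit reduced by penalty times the token's occurrence count), which is consistent with the expected outputs above:

def penalized_logit(base: float, count: int, freq_penalty: float) -> float:
    # Frequency penalty subtracts penalty * occurrence_count from the logit
    return base - count * freq_penalty

# Request 0 (penalty 1.5): after one accepted "1", 100 - 1.5 = 98.5 < 99.0,
# so token 15 wins at the second position -> [1, 15, -1, -1].
assert penalized_logit(100.0, 1, 1.5) < 99.0
# Request 2 (penalty 0.7): one occurrence still wins (99.3 > 99.0), two
# occurrences flip it (98.6 < 99.0) -> [1, 1, 15, -1].
assert penalized_logit(100.0, 1, 0.7) > 99.0
assert penalized_logit(100.0, 2, 0.7) < 99.0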
