[Encoder decoder] Add cuda graph support during decoding for encoder-decoder models #7631
Merged
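For context on the technique this PR enables: CUDA graph decoding captures the fixed-shape decode step once and then replays the recorded kernel launches on every subsequent step, removing per-kernel launch overhead. Below is a minimal sketch of the generic capture/replay pattern using PyTorch's public torch.cuda.graph API. It illustrates the idea only and is not vLLM's model-runner code; the toy model, buffer names, and sizes are invented for the example.

```python
import torch

# Stand-ins for a real decode step: a toy model and a fixed-shape
# input buffer (CUDA graphs require static shapes and addresses).
model = torch.nn.Linear(512, 512).cuda().eval()
static_in = torch.zeros(8, 512, device="cuda")  # fixed batch size 8

# Warm up on a side stream so capture records fully initialized kernels.
side = torch.cuda.Stream()
side.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(side):
    for _ in range(3):
        model(static_in)
torch.cuda.current_stream().wait_stream(side)

# Capture one decode step into a graph; the ops are recorded, not run.
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    static_out = model(static_in)

# Per decode step: refresh the static input buffer, then replay the
# entire recorded launch sequence in one shot.
static_in.copy_(torch.randn(8, 512, device="cuda"))
graph.replay()
print(static_out[0, :4])  # static_out now holds this step's result
```

vLLM exposes the choice between this graph mode and plain eager execution through the enforce_eager flag, which the new test in this PR parametrizes over.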
Commits

91 commits, all authored by sroy745 (changes shown from 88 commits):

5650b95  Merge pull request #1 from vllm-project/main
8f36146  Merge branch 'vllm-project:main' into main
9e75057  Merge branch 'vllm-project:main' into main
db2c679  Merge branch 'vllm-project:main' into main
8d7512c  Merge branch 'vllm-project:main' into main
1473f74  Merge branch 'vllm-project:main' into main
4013e1a  Merge branch 'vllm-project:main' into main
2dbdd78  Merge branch 'vllm-project:main' into main
b3575e9  Merge branch 'vllm-project:main' into main
94b0d43  Merge branch 'vllm-project:main' into main
fa8fedf  Merge branch 'vllm-project:main' into main
6ed96b4  Merge branch 'vllm-project:main' into main
b71c533  Merge branch 'vllm-project:main' into main
57babef  Merge branch 'vllm-project:main' into main
4b19bac  Merge branch 'vllm-project:main' into main
eb7a1c4  Merge branch 'vllm-project:main' into main
7e2c87e  Merge branch 'vllm-project:main' into main
6212d5f  Merge branch 'vllm-project:main' into main
5491438  Merge branch 'vllm-project:main' into main
68e080a  Merge branch 'vllm-project:main' into main
55e4332  Merge branch 'vllm-project:main' into main
c08dc0a  Add cuda graph support during decoding for encoder-decoder models
fbc8837  Add logic for CUDA Graph capture
b3b4e4a  Remove extra line
bc9c32e  Remove debugs
532eb48  Merge branch 'vllm-project:main' into main
33342fb  Add comments
b3425d6  Merge branch 'main' into enc-dec-cuda-graph
7cea056  Merge branch 'vllm-project:main' into main
47ba15b  Merge remote-tracking branch 'origin/main' into enc-dec-cuda-graph
bf70ceb  Merge remote-tracking branch 'origin/main' into enc-dec-cuda-graph
cef7273  Merge
599cd6b  Move logic to backend/utils.py
a9ca02f  Fix import
6b09ee8  Fix formatting
c199b50  Fix test documentation
539b10e  Remove debug stmt
727b3f2  Add a new test
e2e16cf  Fix Batch Size
1fb7cc6  Fix comments
79e3928  Fix formatting
09f9741  Fix test to run with CUDA Graph
eb52df8  fix format
f27e84a  Dummy commit
ada2d05  Dummy commit
185e056  Merge branch 'vllm-project:main' into main
3c75775  Merge remote-tracking branch 'origin/main' into enc-dec-cuda-graph
7b94a35  Format
e2be95f  Merge branch 'vllm-project:main' into main
783b37b  Merge remote-tracking branch 'origin/main' into enc-dec-cuda-graph
4b72b1f  Addressing comments
2ed5473  Merge branch 'vllm-project:main' into main
efa4714  Merge branch 'vllm-project:main' into main
105f6c3  Merge remote-tracking branch 'origin/main' into enc-dec-cuda-graph
525c541  Addressing comments
9f3ad3f  Merge branch 'main' into enc-dec-cuda-graph
785dfc5  Fix format
758c8d2  Add tests
6f61f97  fix format
f93ac1c  Add comments
e408a00  Dummy fix
fb87d34  Merge branch 'vllm-project:main' into main
61af6ed  Merge remote-tracking branch 'origin/main' into enc-dec-cuda-graph
bf7b4fc  Dummy commit
a814a0b  Fix format
5419e49  Merge branch 'vllm-project:main' into main
2ca1c2f  Merge remote-tracking branch 'origin/main' into enc-dec-cuda-graph
4617e39  Adding back the assertion
9ba12f8  Merge branch 'vllm-project:main' into main
25cef3d  Merge branch 'vllm-project:main' into main
0d8b6bc  Merge remote-tracking branch 'origin/main' into enc-dec-cuda-graph
2ab85b8  Dummy commit
0592dc0  Dummy commit
9d4cd09  Merge branch 'vllm-project:main' into main
0b3c9e2  Merge remote-tracking branch 'origin/main' into enc-dec-cuda-graph
c8eb247  Merge branch 'main' into enc-dec-cuda-graph
12f8312  Dummy commit
0ea4479  Dummy commit
c48cacb  Merge branch 'vllm-project:main' into main
c42c399  Merge branch 'vllm-project:main' into main
3d13e43  Merge branch 'vllm-project:main' into main
ed68558  Merge remote-tracking branch 'origin/main' into enc-dec-cuda-graph
7417751  Dummy
12c4af4  Dummy
7cd2b43  Dummy Commit to rerun tests
e4338ce  Dummy Commit to rerun tests
7479775  Merge branch 'vllm-project:main' into main
9c54a60  Merge remote-tracking branch 'origin/main' into enc-dec-cuda-graph
df9b966  Merge branch 'vllm-project:main' into main
7cc1a49  Merge remote-tracking branch 'origin/main' into enc-dec-cuda-graph
3fb360c  Address comments
Files changed

The diff adds one empty file, plus the new test file tests/encoder_decoder/test_e2e_correctness.py (+100 lines), reconstructed below.
```python
"""E2E tests to verify the correctness of the encoder-decoder framework

Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
"""
from typing import List, Optional, Tuple

from vllm.utils import is_cpu

if not is_cpu():
    # CPU backend is not currently supported with encoder/decoder models
    # skip test definitions entirely to avoid importing GPU kernel libs
    # (xFormers, etc.)

    import pytest
    from transformers import AutoModelForSeq2SeqLM

    from vllm.sequence import SampleLogprobs

    from ..conftest import DecoderPromptType
    from ..models.utils import check_logprobs_close

    def vllm_to_hf_output(
        vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],
        decoder_prompt_type: DecoderPromptType,
    ):
        """Sanitize vllm output to be comparable with hf output."""
        output_ids, output_str, out_logprobs = vllm_output

        hf_output_str = output_str + "</s>"
        if decoder_prompt_type == DecoderPromptType.NONE:
            hf_output_str = "<s>" + hf_output_str

        return output_ids, hf_output_str, out_logprobs

    @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
    @pytest.mark.parametrize("dtype", ["bfloat16"])
    @pytest.mark.parametrize("max_tokens", [128])
    @pytest.mark.parametrize("num_logprobs", [5])
    @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType))
    @pytest.mark.parametrize("enforce_eager", [True, False])
    def test_encoder_decoder_e2e(
        hf_runner,
        vllm_runner,
        example_encoder_decoder_prompts,
        model: str,
        dtype: str,
        max_tokens: int,
        num_logprobs: int,
        decoder_prompt_type: DecoderPromptType,
        enforce_eager: bool,
    ) -> None:
        '''
        End-to-End (E2E) test for the encoder-decoder framework.
        This test evaluates the encoder-decoder functionality using the BART
        model. We compare the outputs of the Hugging Face and vLLM
        implementations to ensure that both implementations produce consistent
        and correct results.
        '''
        test_case_prompts = example_encoder_decoder_prompts[
            decoder_prompt_type]

        # Configuration settings for HF baseline
        hf_kwargs = {
            "top_k": None,
            "num_beams": 1,
            "repetition_penalty": 1.0,
            "top_p": 1.0,
            "length_penalty": 1.0,
            "early_stopping": False,
            "no_repeat_ngram_size": None,
            "min_length": 0
        }

        with hf_runner(model, dtype=dtype,
                       auto_cls=AutoModelForSeq2SeqLM) as hf_model:
            hf_outputs = (
                hf_model.generate_encoder_decoder_greedy_logprobs_limit(
                    test_case_prompts,
                    max_tokens,
                    num_logprobs,
                    **hf_kwargs,
                ))
        with vllm_runner(model, dtype=dtype,
                         enforce_eager=enforce_eager) as vllm_model:
            vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
                test_case_prompts, max_tokens, num_logprobs)

        hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE
                          else 0)

        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[
                vllm_to_hf_output(vllm_output, decoder_prompt_type)
                for vllm_output in vllm_outputs
            ],
            name_0="hf",
            name_1="vllm",
            num_outputs_0_skip_tokens=hf_skip_tokens,
        )
```
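To try the same comparison outside pytest, here is a hedged sketch using vLLM's public LLM API with the model and enforce_eager flag the test parametrizes. The prompt string and sampling settings are invented for illustration, and exact encoder/decoder prompt handling may vary by vLLM version:

```python
from vllm import LLM, SamplingParams

# Greedy decoding with logprobs, mirroring the test's settings.
params = SamplingParams(temperature=0.0, max_tokens=128, logprobs=5)

# enforce_eager=True forces plain eager execution; enforce_eager=False
# lets vLLM capture CUDA graphs for decode steps (the path this PR adds
# for encoder-decoder models).
for enforce_eager in (True, False):
    llm = LLM(model="facebook/bart-large-cnn",
              dtype="bfloat16",
              enforce_eager=enforce_eager)
    # For an encoder-decoder model, a plain string serves as the
    # encoder prompt.
    outputs = llm.generate("A fast brown fox jumps over a sleeping dog.",
                           params)
    print(f"enforce_eager={enforce_eager}: "
          f"{outputs[0].outputs[0].text!r}")
```

In practice, run each LLM instance in its own process; two engines in one process can contend for GPU memory.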