From bf2edd22133ef88345366c3eac27297b8eecb90f Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Tue, 3 Sep 2024 13:16:13 -0700
Subject: [PATCH 1/7] refactor fp8 tests

---
 .../basic_correctness/test_chunked_prefill.py |  98 +---------
 tests/models/test_fp8.py                      | 183 ++++++++----------
 tests/models/test_fp8kv_flashinfer.py         |  96 ---------
 3 files changed, 84 insertions(+), 293 deletions(-)
 delete mode 100644 tests/models/test_fp8kv_flashinfer.py

diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index a63ac380e859..cf77b266d9c8 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -10,24 +10,12 @@
 
 import pytest
 
-from ..models.utils import check_logprobs_close, check_outputs_equal
+from ..models.utils import check_outputs_equal
 
 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
 ]
-E5M2_KV_MODELS = [
-    "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-chat-hf",
-]
-E4M3_KV_MODELS = [
-    "meta-llama/Llama-2-7b-chat-hf", "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
-    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
-]
-KV_CACHE_QUANTIZATION_PATHS = {
-    "meta-llama/Llama-2-7b-chat-hf":
-    "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
-}
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -78,90 +66,6 @@ def test_models(
     )
 
 
-@pytest.mark.parametrize("kv_cache_dtype,model",
-                         [("fp8_e5m2", m)
-                          for m in E5M2_KV_MODELS] + [("fp8_e4m3", m)
-                                                      for m in E4M3_KV_MODELS])
-# Due to low-precision numerical divergence, we only test logprob of 4 tokens
-@pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
-@pytest.mark.parametrize("enforce_eager", [False, True])
-# NOTE: Increasing this in this suite will fail CI because we currently cannot
-# reset distributed env properly. Use a value > 1 just when you test.
-@pytest.mark.parametrize("tensor_parallel_size", [1])
-# Due to low-precision numerical divergence, this test is too sensitive to
-# the async postprocessor
-@pytest.mark.parametrize("disable_async_output_proc", [True])
-def test_models_with_fp8_kv_cache(
-    vllm_runner,
-    example_prompts,
-    kv_cache_dtype: str,
-    model: str,
-    max_tokens: int,
-    chunked_prefill_token_size: int,
-    enforce_eager: bool,
-    tensor_parallel_size: int,
-    disable_async_output_proc: bool,
-) -> None:
-    """
-    Only checks log probs match between chunked-prefill and
-    non-chunked-prefill version of vLLM model runner.
-
-    This test is used when there is discrepancy in kernels
-    / numerics (e.g. when using lower-precision types like FP8).
- """ - NUM_LOG_PROBS = 8 - - if model == "facebook/opt-125m": - pytest.skip( - "#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m" - ) - if ((model, kv_cache_dtype, chunked_prefill_token_size) == ( - "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", "fp8_e4m3", 4)): - pytest.skip("flakey test, see: #7874 #8051") - - max_num_seqs = chunked_prefill_token_size - max_num_batched_tokens = chunked_prefill_token_size - - extra_kwargs = {} - if model in KV_CACHE_QUANTIZATION_PATHS: - extra_kwargs["quantization_param_path"] = KV_CACHE_QUANTIZATION_PATHS[ - model] - - with vllm_runner( - model, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - **extra_kwargs, - ) as vllm_model: - no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) - - with vllm_runner( - model, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - **extra_kwargs, - ) as vllm_model: - chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) - - check_logprobs_close( - outputs_0_lst=no_chunked_prefill_outputs, - outputs_1_lst=chunked_prefill_outputs, - name_0="no_chunked_prefill", - name_1="chunked_prefill", - ) - - @pytest.mark.parametrize("max_tokens", [16]) @pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("chunk_size", [30, 32]) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 4ab968c01da0..522e346c4827 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -3,116 +3,99 @@ Note: these tests will only pass on L4 GPU. """ import os -from typing import List +from typing import Optional import pytest -import torch -from transformers import AutoTokenizer +from tests.kernels.utils import override_backend_env_variable from tests.quantization.utils import is_quant_method_supported -from vllm import LLM, SamplingParams -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -MAX_MODEL_LEN = 1024 - -MODELS = [ - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", - "meta-llama/Meta-Llama-3-8B-Instruct", -] +from ..models.utils import check_logprobs_close -EXPECTED_STRS_MAP = { - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV": { - "auto": [ - 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) process information in distinct ways, with both', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, nemuri no' - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system made up of several basic components that work together to enable it to', - 'Zeta-5, a highly advanced robot designed for menial labor, had never experienced anything like', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya kotori wa mushi o tsuk' - ] - }, - "meta-llama/Meta-Llama-3-8B-Instruct": { - "auto": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu' - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', - 'In the year 2154, robotics engineer Dr. Rachel Kim had spent years perfecting her latest', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya tori, mushi o tsukamu' - ] - }, -} +os.environ["TOKENIZERS_PARALLELISM"] = "true" -# This test compares against golden strings for exact match since -# there is no baseline implementation to compare against -# and is unstable w.r.t specifics of the fp8 implementation or -# the hardware being run on. 
-# Disabled to prevent it from breaking the build -@pytest.mark.skip( - reason= - "Prevent unstable test based on golden strings from breaking the build.") @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") -@pytest.mark.parametrize("model_name", MODELS) -@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -def test_models(example_prompts, model_name, kv_cache_dtype) -> None: - model = LLM(model=model_name, - max_model_len=MAX_MODEL_LEN, - trust_remote_code=True, - enforce_eager=True, - quantization="fp8", - kv_cache_dtype=kv_cache_dtype) +@pytest.mark.parametrize( + "kv_cache_dtype,base_model,test_model,scale_path", + [ + # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. + ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct", + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None), + # Test FP16 checkpoint w. fp8_e5m2 kv-cache. + ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Meta-Llama-3-8B-Instruct", None), + # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. + ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Llama-2-7b-chat-hf", + "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") + ]) +# Due to low-precision numerical divergence, we only test logprob of 4 tokens +@pytest.mark.parametrize("max_tokens", [4]) +@pytest.mark.parametrize("enforce_eager", [False, True]) +@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) +# NOTE: Increasing this in this suite will fail CI because we currently cannot +# reset distributed env properly. Use a value > 1 just when you test. +@pytest.mark.parametrize("tensor_parallel_size", [1]) +# Due to low-precision numerical divergence, this test is too sensitive for +# the async postprocessor +@pytest.mark.parametrize("disable_async_output_proc", [True]) +def test_models( + vllm_runner, + example_prompts, + kv_cache_dtype: str, + base_model: str, + test_model: str, + scale_path: Optional[str], + max_tokens: int, + enforce_eager: bool, + backend: str, + tensor_parallel_size: int, + disable_async_output_proc: bool, + monkeypatch, +) -> None: + """ + Only checks log probs match to cover the discrepancy in + numerical sensitive kernels. + """ + override_backend_env_variable(monkeypatch, backend) + + MAX_MODEL_LEN = 1024 + NUM_LOG_PROBS = 8 + + with vllm_runner( + base_model, + max_model_len=MAX_MODEL_LEN, + trust_remote_code=True, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype="auto", + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + baseline_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - tokenizer = AutoTokenizer.from_pretrained(model_name) - formatted_prompts = [ - tokenizer.apply_chat_template([{ - "role": "user", - "content": prompt - }], - tokenize=False, - add_generation_prompt=True) - for prompt in example_prompts - ] + extra_kwargs = {} + if scale_path is not None: + extra_kwargs["quantization_param_path"] = scale_path - params = SamplingParams(max_tokens=20, temperature=0) - generations: List[str] = [] - # Note: these need to be run 1 at a time due to numerical precision, - # since the expected strs were generated this way. 
- for prompt in formatted_prompts: - outputs = model.generate(prompt, params) - generations.append(outputs[0].outputs[0].text) - del model + with vllm_runner( + test_model, + max_model_len=MAX_MODEL_LEN, + trust_remote_code=True, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, + **extra_kwargs, + ) as vllm_model: + test_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - print(model_name, kv_cache_dtype, generations) - expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype] - for i in range(len(example_prompts)): - generated_str = generations[i] - expected_str = expected_strs[i] - assert expected_str == generated_str, ( - f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}") + check_logprobs_close( + outputs_0_lst=baseline_outputs, + outputs_1_lst=test_outputs, + name_0="fp16_kv_cache", + name_1="fp8_kv_cache", + ) diff --git a/tests/models/test_fp8kv_flashinfer.py b/tests/models/test_fp8kv_flashinfer.py deleted file mode 100644 index ff2a44162b6c..000000000000 --- a/tests/models/test_fp8kv_flashinfer.py +++ /dev/null @@ -1,96 +0,0 @@ -# flake8: noqa -"""Tests fp8 models against ground truth generation -This verifies the flashinfer backend with fp8 -quantization and fp8 KV Cache without scaling -factors Note: these tests will only pass on H100 GPU. -""" -import os -from typing import List - -import pytest -from transformers import AutoTokenizer - -from tests.quantization.utils import is_quant_method_supported -from vllm import LLM, SamplingParams - -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -MAX_MODEL_LEN = 1024 - -MODELS = [ - "nm-testing/Meta-Llama-3-8B-Instruct-FP8", -] - -EXPECTED_STRS_MAP = { - "nm-testing/Meta-Llama-3-8B-Instruct-FP8": { - "auto": [ - 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', - 'In the sterile, metallic halls of the robotics lab, a peculiar phenomenon occurred. Zeta-5', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, mushi o', - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
Here', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o', - ] - } -} - - -# This test compares against golden strings for exact match since -# there is no baseline implementation to compare against -# and is unstable w.r.t specifics of the fp8 implementation or -# the hardware being run on. -# No assert to prevent it from breaking the build -@pytest.mark.skipif(not is_quant_method_supported("fp8"), - reason="fp8 is not supported on this GPU type.") -@pytest.mark.parametrize("model_name", MODELS) -@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -@pytest.mark.parametrize("backend", ["XFORMERS", "FLASHINFER"]) -def test_models(example_prompts, model_name, kv_cache_dtype, backend) -> None: - # Note that the golden strings may not work for FLASHINFER Backend. - # The intention is to test the path - os.environ["VLLM_ATTENTION_BACKEND"] = backend - model = LLM(model=model_name, - max_model_len=MAX_MODEL_LEN, - trust_remote_code=True, - quantization="fp8", - kv_cache_dtype=kv_cache_dtype) - - tokenizer = AutoTokenizer.from_pretrained(model_name) - formatted_prompts = [ - tokenizer.apply_chat_template([{ - "role": "user", - "content": prompt - }], - tokenize=False, - add_generation_prompt=True) - for prompt in example_prompts - ] - - params = SamplingParams(max_tokens=20, temperature=0) - generations: List[str] = [] - # Note: these need to be run 1 at a time due to numerical precision, - # since the expected strs were generated this way. - for prompt in formatted_prompts: - outputs = model.generate(prompt, params) - generations.append(outputs[0].outputs[0].text) - del model - - print(f"Testing: {model_name} with kv_cache_dtype: {kv_cache_dtype}") - expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype] - for i in range(len(example_prompts)): - generated_str = generations[i] - expected_str = expected_strs[i] - print(f"generated_str\n: {generated_str}") - print(f"expected_str\n: {expected_str}") From f23d840e17414107398a4d9ef65000daf0b98b14 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 3 Sep 2024 13:25:24 -0700 Subject: [PATCH 2/7] add chunked prefill test back --- .../basic_correctness/test_chunked_prefill.py | 105 +++++++++--------- 1 file changed, 50 insertions(+), 55 deletions(-) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index cf77b266d9c8..58cd20efa51e 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -6,11 +6,10 @@ Run `pytest tests/models/test_chunked_prefill.py`. 
""" -from contextlib import nullcontext import pytest -from ..models.utils import check_outputs_equal +from ..models.utils import check_logprobs_close, check_outputs_equal MODELS = [ "facebook/opt-125m", @@ -66,66 +65,62 @@ def test_models( ) -@pytest.mark.parametrize("max_tokens", [16]) -@pytest.mark.parametrize("enforce_eager", [False]) -@pytest.mark.parametrize("chunk_size", [30, 32]) -@pytest.mark.parametrize("use_v2_block_manager", [False, True]) +@pytest.mark.parametrize( + "kv_cache_dtype,model", + [("fp8_e4m3", + "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")]) +# Due to low-precision numerical divergence, we only test logprob of 4 tokens +@pytest.mark.parametrize("max_tokens", [4]) +@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16]) +@pytest.mark.parametrize("enforce_eager", [False, True]) # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. @pytest.mark.parametrize("tensor_parallel_size", [1]) -def test_with_prefix_caching( +def test_models_with_fp8_kv_cache( vllm_runner, + example_prompts, + kv_cache_dtype: str, + model: str, max_tokens: int, + chunked_prefill_token_size: int, enforce_eager: bool, - chunk_size: int, - use_v2_block_manager: bool, tensor_parallel_size: int, ) -> None: """ - Checks exact match decode with and without prefix caching - with chunked prefill enabled. + Check output logprobs match between no_chunked_prefill and chunked_prefill + with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py, + so here we only check chunked prefill. """ - model = "meta-llama/Llama-2-7b-chat-hf" - # The common prompt has 142 tokens with Llama-2 tokenizer. - common_prompt = "You are a helpful AI assistant " * 20 - unique_prompts = [ - "Question", # Warmup - "Question", # Fully cached - "Another question", # Partial cached - ] - full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts] - - max_num_batched_tokens = max_num_seqs = chunk_size - outputs = {} # type: ignore - check_result = True - for enable in (True, False): - with vllm_runner( - model, - dtype="half", - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - enable_prefix_caching=enable, - tensor_parallel_size=tensor_parallel_size, - use_v2_block_manager=use_v2_block_manager, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - ) as vllm_model: - # It should fail when prefix caching is enable and chunk - # size is not a multiple of block size (16). - should_fail = chunk_size % 16 != 0 and enable - check_result &= not should_fail - outputs[enable] = [] - # Send the request one-by-one to ensure the cache is populated. - with pytest.raises(ValueError) if should_fail else nullcontext(): - for prompt in full_prompts: - outputs[enable] += vllm_model.generate_greedy([prompt], - max_tokens) - - # Check results only if we did not expect a failure. 
- if check_result: - check_outputs_equal( - outputs_0_lst=outputs[False], - outputs_1_lst=outputs[True], - name_0="w/o prefix caching", - name_1="with prefix caching", - ) + NUM_LOG_PROBS = 8 + + max_num_seqs = chunked_prefill_token_size + max_num_batched_tokens = chunked_prefill_token_size + + with vllm_runner( + model, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + kv_cache_dtype=kv_cache_dtype, + ) as vllm_model: + no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) + + with vllm_runner( + model, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=True, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + kv_cache_dtype=kv_cache_dtype, + ) as vllm_model: + chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) + + check_logprobs_close( + outputs_0_lst=no_chunked_prefill_outputs, + outputs_1_lst=chunked_prefill_outputs, + name_0="no_chunked_prefill", + name_1="chunked_prefill", + ) From c73e0275b0e5cd1e4bd35c9f0cff525a4a1a77f3 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 3 Sep 2024 13:27:06 -0700 Subject: [PATCH 3/7] revert --- .../basic_correctness/test_chunked_prefill.py | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 58cd20efa51e..2cd089c309f7 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -6,6 +6,7 @@ Run `pytest tests/models/test_chunked_prefill.py`. """ +from contextlib import nullcontext import pytest @@ -76,6 +77,9 @@ def test_models( # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. 
@pytest.mark.parametrize("tensor_parallel_size", [1]) +# Due to low-precision numerical divergence, this test is too sensitive to +# the async postprocessor +@pytest.mark.parametrize("disable_async_output_proc", [True]) def test_models_with_fp8_kv_cache( vllm_runner, example_prompts, @@ -85,6 +89,7 @@ def test_models_with_fp8_kv_cache( chunked_prefill_token_size: int, enforce_eager: bool, tensor_parallel_size: int, + disable_async_output_proc: bool, ) -> None: """ Check output logprobs match between no_chunked_prefill and chunked_prefill @@ -102,6 +107,7 @@ def test_models_with_fp8_kv_cache( enforce_eager=enforce_eager, max_num_seqs=max_num_seqs, kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, ) as vllm_model: no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) @@ -114,6 +120,7 @@ def test_models_with_fp8_kv_cache( enforce_eager=enforce_eager, max_num_seqs=max_num_seqs, kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, ) as vllm_model: chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) @@ -124,3 +131,68 @@ def test_models_with_fp8_kv_cache( name_0="no_chunked_prefill", name_1="chunked_prefill", ) + + +@pytest.mark.parametrize("max_tokens", [16]) +@pytest.mark.parametrize("enforce_eager", [False]) +@pytest.mark.parametrize("chunk_size", [30, 32]) +@pytest.mark.parametrize("use_v2_block_manager", [False, True]) +# NOTE: Increasing this in this suite will fail CI because we currently cannot +# reset distributed env properly. Use a value > 1 just when you test. +@pytest.mark.parametrize("tensor_parallel_size", [1]) +def test_with_prefix_caching( + vllm_runner, + max_tokens: int, + enforce_eager: bool, + chunk_size: int, + use_v2_block_manager: bool, + tensor_parallel_size: int, +) -> None: + """ + Checks exact match decode with and without prefix caching + with chunked prefill enabled. + """ + model = "meta-llama/Llama-2-7b-chat-hf" + # The common prompt has 142 tokens with Llama-2 tokenizer. + common_prompt = "You are a helpful AI assistant " * 20 + unique_prompts = [ + "Question", # Warmup + "Question", # Fully cached + "Another question", # Partial cached + ] + full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts] + + max_num_batched_tokens = max_num_seqs = chunk_size + outputs = {} # type: ignore + check_result = True + for enable in (True, False): + with vllm_runner( + model, + dtype="half", + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=True, + enable_prefix_caching=enable, + tensor_parallel_size=tensor_parallel_size, + use_v2_block_manager=use_v2_block_manager, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + ) as vllm_model: + # It should fail when prefix caching is enable and chunk + # size is not a multiple of block size (16). + should_fail = chunk_size % 16 != 0 and enable + check_result &= not should_fail + outputs[enable] = [] + # Send the request one-by-one to ensure the cache is populated. + with pytest.raises(ValueError) if should_fail else nullcontext(): + for prompt in full_prompts: + outputs[enable] += vllm_model.generate_greedy([prompt], + max_tokens) + + # Check results only if we did not expect a failure. 
+ if check_result: + check_outputs_equal( + outputs_0_lst=outputs[False], + outputs_1_lst=outputs[True], + name_0="w/o prefix caching", + name_1="with prefix caching", + ) From 210d140789eea2e536b8ce98493e1b77279c3fa2 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 3 Sep 2024 14:08:24 -0700 Subject: [PATCH 4/7] format --- tests/basic_correctness/test_chunked_prefill.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 2cd089c309f7..9c34b2a13fd5 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -148,17 +148,17 @@ def test_with_prefix_caching( use_v2_block_manager: bool, tensor_parallel_size: int, ) -> None: - """ - Checks exact match decode with and without prefix caching - with chunked prefill enabled. + """ + Checks exact match decode with and without prefix caching + with chunked prefill enabled. """ model = "meta-llama/Llama-2-7b-chat-hf" # The common prompt has 142 tokens with Llama-2 tokenizer. common_prompt = "You are a helpful AI assistant " * 20 unique_prompts = [ - "Question", # Warmup - "Question", # Fully cached - "Another question", # Partial cached + "Question", # Warmup + "Question", # Fully cached + "Another question", # Partial cached ] full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts] From a0017965d701aca69042fb8f9c316c0da2aa8ecf Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 4 Sep 2024 09:36:40 -0700 Subject: [PATCH 5/7] Update tests/models/test_fp8.py --- tests/models/test_fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 522e346c4827..6c457408ca52 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -67,7 +67,6 @@ def test_models( with vllm_runner( base_model, max_model_len=MAX_MODEL_LEN, - trust_remote_code=True, tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, kv_cache_dtype="auto", From 67f32a70b86dee35ecc095c45209f555796b21ba Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 4 Sep 2024 09:36:44 -0700 Subject: [PATCH 6/7] Update tests/models/test_fp8.py --- tests/models/test_fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 6c457408ca52..17acdb52322f 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -82,7 +82,6 @@ def test_models( with vllm_runner( test_model, max_model_len=MAX_MODEL_LEN, - trust_remote_code=True, tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, From 911ad12eaf9591aa4159bf65bf6c5e941b29c8da Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 4 Sep 2024 09:45:59 -0700 Subject: [PATCH 7/7] disable test on CPU --- .buildkite/run-cpu-test.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 8e4be08f3aba..ca9cf15780e2 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,7 +23,12 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest matplotlib einops transformers_stream_generator - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py 
--ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \ + --ignore=tests/models/test_oot_registration.py \ + --ignore=tests/models/test_registry.py \ + --ignore=tests/models/test_fp8.py \ + --ignore=tests/models/test_jamba.py \ + --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported # online inference docker exec cpu-test bash -c "