From bf2edd22133ef88345366c3eac27297b8eecb90f Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Tue, 3 Sep 2024 13:16:13 -0700
Subject: [PATCH 1/7] refactor fp8 tests

---
 .../basic_correctness/test_chunked_prefill.py |  98 +---------
 tests/models/test_fp8.py                      | 183 ++++++++----------
 tests/models/test_fp8kv_flashinfer.py         |  96 ---------
 3 files changed, 84 insertions(+), 293 deletions(-)
 delete mode 100644 tests/models/test_fp8kv_flashinfer.py

diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index a63ac380e859..cf77b266d9c8 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -10,24 +10,12 @@
 
 import pytest
 
-from ..models.utils import check_logprobs_close, check_outputs_equal
+from ..models.utils import check_outputs_equal
 
 MODELS = [
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
 ]
-E5M2_KV_MODELS = [
-    "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-chat-hf",
-]
-E4M3_KV_MODELS = [
-    "meta-llama/Llama-2-7b-chat-hf", "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
-    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
-]
-KV_CACHE_QUANTIZATION_PATHS = {
-    "meta-llama/Llama-2-7b-chat-hf":
-    "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
-}
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -78,90 +66,6 @@ def test_models(
     )
 
 
-@pytest.mark.parametrize("kv_cache_dtype,model",
-                         [("fp8_e5m2", m)
-                          for m in E5M2_KV_MODELS] + [("fp8_e4m3", m)
-                                                      for m in E4M3_KV_MODELS])
-# Due to low-precision numerical divergence, we only test logprob of 4 tokens
-@pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
-@pytest.mark.parametrize("enforce_eager", [False, True])
-# NOTE: Increasing this in this suite will fail CI because we currently cannot
-# reset distributed env properly. Use a value > 1 just when you test.
-@pytest.mark.parametrize("tensor_parallel_size", [1])
-# Due to low-precision numerical divergence, this test is too sensitive to
-# the async postprocessor
-@pytest.mark.parametrize("disable_async_output_proc", [True])
-def test_models_with_fp8_kv_cache(
-    vllm_runner,
-    example_prompts,
-    kv_cache_dtype: str,
-    model: str,
-    max_tokens: int,
-    chunked_prefill_token_size: int,
-    enforce_eager: bool,
-    tensor_parallel_size: int,
-    disable_async_output_proc: bool,
-) -> None:
-    """
-    Only checks log probs match between chunked-prefill and
-    non-chunked-prefill version of vLLM model runner.
-
-    This test is used when there is discrepancy in kernels
-    / numerics (e.g. when using lower-precision types like FP8).
- """ - NUM_LOG_PROBS = 8 - - if model == "facebook/opt-125m": - pytest.skip( - "#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m" - ) - if ((model, kv_cache_dtype, chunked_prefill_token_size) == ( - "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", "fp8_e4m3", 4)): - pytest.skip("flakey test, see: #7874 #8051") - - max_num_seqs = chunked_prefill_token_size - max_num_batched_tokens = chunked_prefill_token_size - - extra_kwargs = {} - if model in KV_CACHE_QUANTIZATION_PATHS: - extra_kwargs["quantization_param_path"] = KV_CACHE_QUANTIZATION_PATHS[ - model] - - with vllm_runner( - model, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - **extra_kwargs, - ) as vllm_model: - no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) - - with vllm_runner( - model, - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - tensor_parallel_size=tensor_parallel_size, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - kv_cache_dtype=kv_cache_dtype, - disable_async_output_proc=disable_async_output_proc, - **extra_kwargs, - ) as vllm_model: - chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) - - check_logprobs_close( - outputs_0_lst=no_chunked_prefill_outputs, - outputs_1_lst=chunked_prefill_outputs, - name_0="no_chunked_prefill", - name_1="chunked_prefill", - ) - - @pytest.mark.parametrize("max_tokens", [16]) @pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("chunk_size", [30, 32]) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 4ab968c01da0..522e346c4827 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -3,116 +3,99 @@ Note: these tests will only pass on L4 GPU. """ import os -from typing import List +from typing import Optional import pytest -import torch -from transformers import AutoTokenizer +from tests.kernels.utils import override_backend_env_variable from tests.quantization.utils import is_quant_method_supported -from vllm import LLM, SamplingParams -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -MAX_MODEL_LEN = 1024 - -MODELS = [ - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", - "meta-llama/Meta-Llama-3-8B-Instruct", -] +from ..models.utils import check_logprobs_close -EXPECTED_STRS_MAP = { - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV": { - "auto": [ - 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) process information in distinct ways, with both', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, nemuri no' - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system made up of several basic components that work together to enable it to', - 'Zeta-5, a highly advanced robot designed for menial labor, had never experienced anything like', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya kotori wa mushi o tsuk' - ] - }, - "meta-llama/Meta-Llama-3-8B-Instruct": { - "auto": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu' - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', - 'In the year 2154, robotics engineer Dr. Rachel Kim had spent years perfecting her latest', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya tori, mushi o tsukamu' - ] - }, -} +os.environ["TOKENIZERS_PARALLELISM"] = "true" -# This test compares against golden strings for exact match since -# there is no baseline implementation to compare against -# and is unstable w.r.t specifics of the fp8 implementation or -# the hardware being run on. 
-# Disabled to prevent it from breaking the build -@pytest.mark.skip( - reason= - "Prevent unstable test based on golden strings from breaking the build.") @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") -@pytest.mark.parametrize("model_name", MODELS) -@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -def test_models(example_prompts, model_name, kv_cache_dtype) -> None: - model = LLM(model=model_name, - max_model_len=MAX_MODEL_LEN, - trust_remote_code=True, - enforce_eager=True, - quantization="fp8", - kv_cache_dtype=kv_cache_dtype) +@pytest.mark.parametrize( + "kv_cache_dtype,base_model,test_model,scale_path", + [ + # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. + ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct", + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None), + # Test FP16 checkpoint w. fp8_e5m2 kv-cache. + ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Meta-Llama-3-8B-Instruct", None), + # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. + ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Llama-2-7b-chat-hf", + "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") + ]) +# Due to low-precision numerical divergence, we only test logprob of 4 tokens +@pytest.mark.parametrize("max_tokens", [4]) +@pytest.mark.parametrize("enforce_eager", [False, True]) +@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) +# NOTE: Increasing this in this suite will fail CI because we currently cannot +# reset distributed env properly. Use a value > 1 just when you test. +@pytest.mark.parametrize("tensor_parallel_size", [1]) +# Due to low-precision numerical divergence, this test is too sensitive for +# the async postprocessor +@pytest.mark.parametrize("disable_async_output_proc", [True]) +def test_models( + vllm_runner, + example_prompts, + kv_cache_dtype: str, + base_model: str, + test_model: str, + scale_path: Optional[str], + max_tokens: int, + enforce_eager: bool, + backend: str, + tensor_parallel_size: int, + disable_async_output_proc: bool, + monkeypatch, +) -> None: + """ + Only checks log probs match to cover the discrepancy in + numerical sensitive kernels. + """ + override_backend_env_variable(monkeypatch, backend) + + MAX_MODEL_LEN = 1024 + NUM_LOG_PROBS = 8 + + with vllm_runner( + base_model, + max_model_len=MAX_MODEL_LEN, + trust_remote_code=True, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype="auto", + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + baseline_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - tokenizer = AutoTokenizer.from_pretrained(model_name) - formatted_prompts = [ - tokenizer.apply_chat_template([{ - "role": "user", - "content": prompt - }], - tokenize=False, - add_generation_prompt=True) - for prompt in example_prompts - ] + extra_kwargs = {} + if scale_path is not None: + extra_kwargs["quantization_param_path"] = scale_path - params = SamplingParams(max_tokens=20, temperature=0) - generations: List[str] = [] - # Note: these need to be run 1 at a time due to numerical precision, - # since the expected strs were generated this way. 
- for prompt in formatted_prompts: - outputs = model.generate(prompt, params) - generations.append(outputs[0].outputs[0].text) - del model + with vllm_runner( + test_model, + max_model_len=MAX_MODEL_LEN, + trust_remote_code=True, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, + **extra_kwargs, + ) as vllm_model: + test_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - print(model_name, kv_cache_dtype, generations) - expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype] - for i in range(len(example_prompts)): - generated_str = generations[i] - expected_str = expected_strs[i] - assert expected_str == generated_str, ( - f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}") + check_logprobs_close( + outputs_0_lst=baseline_outputs, + outputs_1_lst=test_outputs, + name_0="fp16_kv_cache", + name_1="fp8_kv_cache", + ) diff --git a/tests/models/test_fp8kv_flashinfer.py b/tests/models/test_fp8kv_flashinfer.py deleted file mode 100644 index ff2a44162b6c..000000000000 --- a/tests/models/test_fp8kv_flashinfer.py +++ /dev/null @@ -1,96 +0,0 @@ -# flake8: noqa -"""Tests fp8 models against ground truth generation -This verifies the flashinfer backend with fp8 -quantization and fp8 KV Cache without scaling -factors Note: these tests will only pass on H100 GPU. -""" -import os -from typing import List - -import pytest -from transformers import AutoTokenizer - -from tests.quantization.utils import is_quant_method_supported -from vllm import LLM, SamplingParams - -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -MAX_MODEL_LEN = 1024 - -MODELS = [ - "nm-testing/Meta-Llama-3-8B-Instruct-FP8", -] - -EXPECTED_STRS_MAP = { - "nm-testing/Meta-Llama-3-8B-Instruct-FP8": { - "auto": [ - 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', - 'In the sterile, metallic halls of the robotics lab, a peculiar phenomenon occurred. Zeta-5', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, mushi o', - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
Here', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o', - ] - } -} - - -# This test compares against golden strings for exact match since -# there is no baseline implementation to compare against -# and is unstable w.r.t specifics of the fp8 implementation or -# the hardware being run on. -# No assert to prevent it from breaking the build -@pytest.mark.skipif(not is_quant_method_supported("fp8"), - reason="fp8 is not supported on this GPU type.") -@pytest.mark.parametrize("model_name", MODELS) -@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -@pytest.mark.parametrize("backend", ["XFORMERS", "FLASHINFER"]) -def test_models(example_prompts, model_name, kv_cache_dtype, backend) -> None: - # Note that the golden strings may not work for FLASHINFER Backend. - # The intention is to test the path - os.environ["VLLM_ATTENTION_BACKEND"] = backend - model = LLM(model=model_name, - max_model_len=MAX_MODEL_LEN, - trust_remote_code=True, - quantization="fp8", - kv_cache_dtype=kv_cache_dtype) - - tokenizer = AutoTokenizer.from_pretrained(model_name) - formatted_prompts = [ - tokenizer.apply_chat_template([{ - "role": "user", - "content": prompt - }], - tokenize=False, - add_generation_prompt=True) - for prompt in example_prompts - ] - - params = SamplingParams(max_tokens=20, temperature=0) - generations: List[str] = [] - # Note: these need to be run 1 at a time due to numerical precision, - # since the expected strs were generated this way. - for prompt in formatted_prompts: - outputs = model.generate(prompt, params) - generations.append(outputs[0].outputs[0].text) - del model - - print(f"Testing: {model_name} with kv_cache_dtype: {kv_cache_dtype}") - expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype] - for i in range(len(example_prompts)): - generated_str = generations[i] - expected_str = expected_strs[i] - print(f"generated_str\n: {generated_str}") - print(f"expected_str\n: {expected_str}") From f23d840e17414107398a4d9ef65000daf0b98b14 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 3 Sep 2024 13:25:24 -0700 Subject: [PATCH 2/7] add chunked prefill test back --- .../basic_correctness/test_chunked_prefill.py | 105 +++++++++--------- 1 file changed, 50 insertions(+), 55 deletions(-) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index cf77b266d9c8..58cd20efa51e 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -6,11 +6,10 @@ Run `pytest tests/models/test_chunked_prefill.py`. 
""" -from contextlib import nullcontext import pytest -from ..models.utils import check_outputs_equal +from ..models.utils import check_logprobs_close, check_outputs_equal MODELS = [ "facebook/opt-125m", @@ -66,66 +65,62 @@ def test_models( ) -@pytest.mark.parametrize("max_tokens", [16]) -@pytest.mark.parametrize("enforce_eager", [False]) -@pytest.mark.parametrize("chunk_size", [30, 32]) -@pytest.mark.parametrize("use_v2_block_manager", [False, True]) +@pytest.mark.parametrize( + "kv_cache_dtype,model", + [("fp8_e4m3", + "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")]) +# Due to low-precision numerical divergence, we only test logprob of 4 tokens +@pytest.mark.parametrize("max_tokens", [4]) +@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16]) +@pytest.mark.parametrize("enforce_eager", [False, True]) # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. @pytest.mark.parametrize("tensor_parallel_size", [1]) -def test_with_prefix_caching( +def test_models_with_fp8_kv_cache( vllm_runner, + example_prompts, + kv_cache_dtype: str, + model: str, max_tokens: int, + chunked_prefill_token_size: int, enforce_eager: bool, - chunk_size: int, - use_v2_block_manager: bool, tensor_parallel_size: int, ) -> None: """ - Checks exact match decode with and without prefix caching - with chunked prefill enabled. + Check output logprobs match between no_chunked_prefill and chunked_prefill + with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py, + so here we only check chunked prefill. """ - model = "meta-llama/Llama-2-7b-chat-hf" - # The common prompt has 142 tokens with Llama-2 tokenizer. - common_prompt = "You are a helpful AI assistant " * 20 - unique_prompts = [ - "Question", # Warmup - "Question", # Fully cached - "Another question", # Partial cached - ] - full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts] - - max_num_batched_tokens = max_num_seqs = chunk_size - outputs = {} # type: ignore - check_result = True - for enable in (True, False): - with vllm_runner( - model, - dtype="half", - max_num_batched_tokens=max_num_batched_tokens, - enable_chunked_prefill=True, - enable_prefix_caching=enable, - tensor_parallel_size=tensor_parallel_size, - use_v2_block_manager=use_v2_block_manager, - enforce_eager=enforce_eager, - max_num_seqs=max_num_seqs, - ) as vllm_model: - # It should fail when prefix caching is enable and chunk - # size is not a multiple of block size (16). - should_fail = chunk_size % 16 != 0 and enable - check_result &= not should_fail - outputs[enable] = [] - # Send the request one-by-one to ensure the cache is populated. - with pytest.raises(ValueError) if should_fail else nullcontext(): - for prompt in full_prompts: - outputs[enable] += vllm_model.generate_greedy([prompt], - max_tokens) - - # Check results only if we did not expect a failure. 
- if check_result: - check_outputs_equal( - outputs_0_lst=outputs[False], - outputs_1_lst=outputs[True], - name_0="w/o prefix caching", - name_1="with prefix caching", - ) + NUM_LOG_PROBS = 8 + + max_num_seqs = chunked_prefill_token_size + max_num_batched_tokens = chunked_prefill_token_size + + with vllm_runner( + model, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + kv_cache_dtype=kv_cache_dtype, + ) as vllm_model: + no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) + + with vllm_runner( + model, + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=True, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + kv_cache_dtype=kv_cache_dtype, + ) as vllm_model: + chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) + + check_logprobs_close( + outputs_0_lst=no_chunked_prefill_outputs, + outputs_1_lst=chunked_prefill_outputs, + name_0="no_chunked_prefill", + name_1="chunked_prefill", + ) From c73e0275b0e5cd1e4bd35c9f0cff525a4a1a77f3 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 3 Sep 2024 13:27:06 -0700 Subject: [PATCH 3/7] revert --- .../basic_correctness/test_chunked_prefill.py | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 58cd20efa51e..2cd089c309f7 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -6,6 +6,7 @@ Run `pytest tests/models/test_chunked_prefill.py`. """ +from contextlib import nullcontext import pytest @@ -76,6 +77,9 @@ def test_models( # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. 
@pytest.mark.parametrize("tensor_parallel_size", [1]) +# Due to low-precision numerical divergence, this test is too sensitive to +# the async postprocessor +@pytest.mark.parametrize("disable_async_output_proc", [True]) def test_models_with_fp8_kv_cache( vllm_runner, example_prompts, @@ -85,6 +89,7 @@ def test_models_with_fp8_kv_cache( chunked_prefill_token_size: int, enforce_eager: bool, tensor_parallel_size: int, + disable_async_output_proc: bool, ) -> None: """ Check output logprobs match between no_chunked_prefill and chunked_prefill @@ -102,6 +107,7 @@ def test_models_with_fp8_kv_cache( enforce_eager=enforce_eager, max_num_seqs=max_num_seqs, kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, ) as vllm_model: no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) @@ -114,6 +120,7 @@ def test_models_with_fp8_kv_cache( enforce_eager=enforce_eager, max_num_seqs=max_num_seqs, kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, ) as vllm_model: chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) @@ -124,3 +131,68 @@ def test_models_with_fp8_kv_cache( name_0="no_chunked_prefill", name_1="chunked_prefill", ) + + +@pytest.mark.parametrize("max_tokens", [16]) +@pytest.mark.parametrize("enforce_eager", [False]) +@pytest.mark.parametrize("chunk_size", [30, 32]) +@pytest.mark.parametrize("use_v2_block_manager", [False, True]) +# NOTE: Increasing this in this suite will fail CI because we currently cannot +# reset distributed env properly. Use a value > 1 just when you test. +@pytest.mark.parametrize("tensor_parallel_size", [1]) +def test_with_prefix_caching( + vllm_runner, + max_tokens: int, + enforce_eager: bool, + chunk_size: int, + use_v2_block_manager: bool, + tensor_parallel_size: int, +) -> None: + """ + Checks exact match decode with and without prefix caching + with chunked prefill enabled. + """ + model = "meta-llama/Llama-2-7b-chat-hf" + # The common prompt has 142 tokens with Llama-2 tokenizer. + common_prompt = "You are a helpful AI assistant " * 20 + unique_prompts = [ + "Question", # Warmup + "Question", # Fully cached + "Another question", # Partial cached + ] + full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts] + + max_num_batched_tokens = max_num_seqs = chunk_size + outputs = {} # type: ignore + check_result = True + for enable in (True, False): + with vllm_runner( + model, + dtype="half", + max_num_batched_tokens=max_num_batched_tokens, + enable_chunked_prefill=True, + enable_prefix_caching=enable, + tensor_parallel_size=tensor_parallel_size, + use_v2_block_manager=use_v2_block_manager, + enforce_eager=enforce_eager, + max_num_seqs=max_num_seqs, + ) as vllm_model: + # It should fail when prefix caching is enable and chunk + # size is not a multiple of block size (16). + should_fail = chunk_size % 16 != 0 and enable + check_result &= not should_fail + outputs[enable] = [] + # Send the request one-by-one to ensure the cache is populated. + with pytest.raises(ValueError) if should_fail else nullcontext(): + for prompt in full_prompts: + outputs[enable] += vllm_model.generate_greedy([prompt], + max_tokens) + + # Check results only if we did not expect a failure. 
+ if check_result: + check_outputs_equal( + outputs_0_lst=outputs[False], + outputs_1_lst=outputs[True], + name_0="w/o prefix caching", + name_1="with prefix caching", + ) From 210d140789eea2e536b8ce98493e1b77279c3fa2 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 3 Sep 2024 14:08:24 -0700 Subject: [PATCH 4/7] format --- tests/basic_correctness/test_chunked_prefill.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 2cd089c309f7..9c34b2a13fd5 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -148,17 +148,17 @@ def test_with_prefix_caching( use_v2_block_manager: bool, tensor_parallel_size: int, ) -> None: - """ - Checks exact match decode with and without prefix caching - with chunked prefill enabled. + """ + Checks exact match decode with and without prefix caching + with chunked prefill enabled. """ model = "meta-llama/Llama-2-7b-chat-hf" # The common prompt has 142 tokens with Llama-2 tokenizer. common_prompt = "You are a helpful AI assistant " * 20 unique_prompts = [ - "Question", # Warmup - "Question", # Fully cached - "Another question", # Partial cached + "Question", # Warmup + "Question", # Fully cached + "Another question", # Partial cached ] full_prompts = [f"{common_prompt}\n{p}" for p in unique_prompts] From a0017965d701aca69042fb8f9c316c0da2aa8ecf Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 4 Sep 2024 09:36:40 -0700 Subject: [PATCH 5/7] Update tests/models/test_fp8.py --- tests/models/test_fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 522e346c4827..6c457408ca52 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -67,7 +67,6 @@ def test_models( with vllm_runner( base_model, max_model_len=MAX_MODEL_LEN, - trust_remote_code=True, tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, kv_cache_dtype="auto", From 67f32a70b86dee35ecc095c45209f555796b21ba Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 4 Sep 2024 09:36:44 -0700 Subject: [PATCH 6/7] Update tests/models/test_fp8.py --- tests/models/test_fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 6c457408ca52..17acdb52322f 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -82,7 +82,6 @@ def test_models( with vllm_runner( test_model, max_model_len=MAX_MODEL_LEN, - trust_remote_code=True, tensor_parallel_size=tensor_parallel_size, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, From 911ad12eaf9591aa4159bf65bf6c5e941b29c8da Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 4 Sep 2024 09:45:59 -0700 Subject: [PATCH 7/7] disable test on CPU --- .buildkite/run-cpu-test.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 8e4be08f3aba..ca9cf15780e2 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,7 +23,12 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest matplotlib einops transformers_stream_generator - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py 
--ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \ + --ignore=tests/models/test_oot_registration.py \ + --ignore=tests/models/test_registry.py \ + --ignore=tests/models/test_fp8.py \ + --ignore=tests/models/test_jamba.py \ + --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported # online inference docker exec cpu-test bash -c "