Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ steps:
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
parallelism: 4

- label: "PyTorch Fullgraph Smoke Test" # 9min
- label: PyTorch Fullgraph Smoke Test # 9min
fast_check: true
source_file_dependencies:
- vllm/
Expand All @@ -289,7 +289,7 @@ steps:
- pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py

- label: "PyTorch Fullgraph Test" # 18min
- label: PyTorch Fullgraph Test # 18min
source_file_dependencies:
- vllm/
- tests/compile
Expand Down
7 changes: 4 additions & 3 deletions csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ void cutlass_scaled_fp4_mm(torch::Tensor& D, torch::Tensor const& A,
#if defined ENABLE_NVFP4 && ENABLE_NVFP4
return cutlass_scaled_fp4_mm_sm100a(D, A, B, A_sf, B_sf, alpha);
#endif
TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 mm kernel, vLLM should "
"be compiled using CUDA 12.8 and target "
"compute capability 100 or above.");
TORCH_CHECK_NOT_IMPLEMENTED(false,
"No compiled nvfp4 mm kernel, vLLM should "
"be compiled using CUDA 12.8 and target "
"compute capability 100 or above.");
}
5 changes: 3 additions & 2 deletions csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,9 @@ void runGemm(at::Tensor& D, at::Tensor const& A, at::Tensor const& B,
at::Tensor const& A_sf, at::Tensor const& B_sf,
at::Tensor const& alpha, int64_t m, int64_t n, int64_t k,
cudaStream_t stream) {
TORCH_CHECK(false, "Unsupported CUTLASS version. Set VLLM_CUTLASS_SRC_DIR to "
"a CUTLASS 3.8 source directory to enable support.");
TORCH_CHECK(false,
"Unsupported CUTLASS version. Set VLLM_CUTLASS_SRC_DIR to "
"a CUTLASS 3.8 source directory to enable support.");
}
#endif // defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)

Expand Down
11 changes: 5 additions & 6 deletions tests/basic_correctness/test_basic_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import pytest

from vllm import LLM
from vllm.config import LoadFormat
from vllm.platforms import current_platform

from ..conftest import VllmRunner
Expand All @@ -34,7 +33,7 @@ def v1(run_with_both_engines):

def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER)
llm = LLM("distilbert/distilgpt2")
weak_llm = weakref.ref(llm)
del llm
# If there's any circular reference to vllm, this fails
Expand All @@ -43,10 +42,10 @@ def test_vllm_gc_ed():


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True])
@pytest.mark.parametrize("enforce_eager", [False])
def test_models(
hf_runner,
model: str,
Expand Down Expand Up @@ -97,8 +96,8 @@ def test_models(
"test_suite", [
("distilbert/distilgpt2", "ray", "", "L4"),
("distilbert/distilgpt2", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
("distilbert/distilgpt2", "ray", "", "A100"),
("distilbert/distilgpt2", "mp", "", "A100"),
("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
Expand Down
9 changes: 2 additions & 7 deletions tests/basic_correctness/test_cumem.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@
import torch

from vllm import LLM, SamplingParams
from vllm.config import LoadFormat
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes

from ..conftest import MODEL_WEIGHTS_S3_BUCKET
from ..utils import fork_new_process_for_each_test


Expand Down Expand Up @@ -121,7 +119,7 @@ def model(x):
"model, use_v1",
[
# sleep mode with safetensors
(f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True),
("meta-llama/Llama-3.2-1B", True),
# sleep mode with pytorch checkpoint
("facebook/opt-125m", False),
])
Expand All @@ -130,10 +128,7 @@ def test_end_to_end(model: str, use_v1: bool):
os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
load_format = LoadFormat.AUTO
if "Llama" in model:
load_format = LoadFormat.RUNAI_STREAMER
llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
llm = LLM(model, enable_sleep_mode=True)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params)
Expand Down
73 changes: 1 addition & 72 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
from vllm.config import TaskOption, TokenizerPoolConfig
from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
Expand All @@ -47,70 +47,6 @@

_M = TypeVar("_M")

MODELS_ON_S3 = [
"distilbert/distilgpt2",
"meta-llama/Llama-2-7b-hf",
"meta-llama/Meta-Llama-3-8B",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
"openai-community/gpt2",
"ArthurZ/Ilama-3.2-1B",
"llava-hf/llava-1.5-7b-hf",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"ai21labs/Jamba-tiny-random",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Phi-3-mini-128k-instruct-FP8",
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
"AMead10/Llama-3.2-1B-Instruct-AWQ",
"shuyuej/Llama-3.2-1B-Instruct-GPTQ",
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
"ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
"amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
"nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
"neuralmagic/Llama-3.2-1B-quantized.w8a8",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
"nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
"nm-testing/tinyllama-oneshot-w4a16-group128-v2",
"nm-testing/tinyllama-oneshot-w8a16-per-channel",
"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
"nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
]

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"

_PromptMultiModalInput = Union[List[_M], List[List[_M]]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
Expand Down Expand Up @@ -742,14 +678,8 @@ def __init__(
enable_chunked_prefill: bool = False,
swap_space: int = 4,
enforce_eager: Optional[bool] = False,
load_format: Optional[LoadFormat] = None,
**kwargs,
) -> None:
if model_name in MODELS_ON_S3 and not load_format:
model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}")
load_format = LoadFormat.RUNAI_STREAMER
if not load_format:
load_format = LoadFormat.AUTO
self.model = LLM(
model=model_name,
task=task,
Expand All @@ -764,7 +694,6 @@ def __init__(
max_model_len=max_model_len,
block_size=block_size,
enable_chunked_prefill=enable_chunked_prefill,
load_format=load_format,
**kwargs,
)

Expand Down
7 changes: 1 addition & 6 deletions tests/engine/test_computed_prefix_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,12 @@

import pytest

from vllm.config import LoadFormat
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET


@pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
@pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion
Expand All @@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int):
"decoration.")

engine_args = EngineArgs(model=model,
load_format=LoadFormat.RUNAI_STREAMER,
block_size=block_size,
enable_prefix_caching=True)

Expand Down
8 changes: 2 additions & 6 deletions tests/engine/test_detokenization.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,11 @@

import pytest

from vllm.config import LoadFormat
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET


@pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text
Expand All @@ -21,7 +17,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"online for free?")

llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER)
llm = LLM(model=model)
sampling_params = SamplingParams(max_tokens=10,
temperature=0.0,
detokenize=False)
Expand Down
21 changes: 4 additions & 17 deletions tests/engine/test_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,12 @@

import pytest

from vllm.config import LoadFormat
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.executor.uniproc_executor import UniProcExecutor
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER


class Mock:
...
Expand All @@ -38,12 +33,10 @@ def collective_rpc(self,
CustomUniExecutorAsync = CustomUniExecutor


@pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
def test_custom_executor_type_checking(model):
with pytest.raises(ValueError):
engine_args = EngineArgs(model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=Mock)
LLMEngine.from_engine_args(engine_args)
with pytest.raises(ValueError):
Expand All @@ -52,8 +45,7 @@ def test_custom_executor_type_checking(model):
AsyncLLMEngine.from_engine_args(engine_args)


@pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
def test_custom_executor(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmp_path)
Expand All @@ -62,7 +54,6 @@ def test_custom_executor(model, tmp_path):

engine_args = EngineArgs(
model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=CustomUniExecutor,
enforce_eager=True, # reduce test time
)
Expand All @@ -77,8 +68,7 @@ def test_custom_executor(model, tmp_path):
os.chdir(cwd)


@pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmp_path)
Expand All @@ -87,7 +77,6 @@ def test_custom_executor_async(model, tmp_path):

engine_args = AsyncEngineArgs(
model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=CustomUniExecutorAsync,
enforce_eager=True, # reduce test time
)
Expand All @@ -106,8 +95,7 @@ async def t():
os.chdir(cwd)


@pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
def test_respect_ray(model):
# even for TP=1 and PP=1,
# if users specify ray, we should use ray.
Expand All @@ -116,7 +104,6 @@ def test_respect_ray(model):
engine_args = EngineArgs(
model=model,
distributed_executor_backend="ray",
load_format=RUNAI_STREAMER_LOAD_FORMAT,
enforce_eager=True, # reduce test time
)
engine = LLMEngine.from_engine_args(engine_args)
Expand Down
13 changes: 5 additions & 8 deletions tests/engine/test_skip_tokenizer_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,19 @@

import pytest

from vllm.config import LoadFormat
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET


@pytest.mark.parametrize("model",
[f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
llm = LLM(model=model,
skip_tokenizer_init=True,
load_format=LoadFormat.RUNAI_STREAMER)
llm = LLM(
model=model,
skip_tokenizer_init=True,
)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)

with pytest.raises(ValueError, match="cannot pass text prompts when"):
Expand Down
Loading