Commit 5c71345

khluu authored and Akshat-Tripathi committed
[ci] Use env var to control whether to use S3 bucket in CI (vllm-project#13634)
1 parent e5ad78f commit 5c71345

30 files changed: 222 additions, 231 deletions
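Note: the hunks below only remove the hard-coded S3 redirection from individual tests and from conftest.py; the environment-variable switch named in the commit title lives in CI tooling outside the files shown here. As a rough sketch of the intent only, with the variable name and helper invented for illustration, the gate would look something like:

import os

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"  # bucket constant removed from conftest.py below


def maybe_use_s3(model_name: str) -> str:
    # Hypothetical helper: redirect a Hugging Face model name to its S3 mirror
    # only when the CI job opts in via an environment variable.
    if os.getenv("VLLM_CI_USE_S3", "0") == "1":  # assumed variable name, not from this diff
        return f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}"
    return model_name


print(maybe_use_s3("distilbert/distilgpt2"))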

.buildkite/test-pipeline.yaml

Lines changed: 2 additions & 2 deletions
@@ -278,7 +278,7 @@ steps:
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
   parallelism: 4
 
-- label: "PyTorch Fullgraph Smoke Test" # 9min
+- label: PyTorch Fullgraph Smoke Test # 9min
   fast_check: true
   source_file_dependencies:
   - vllm/
@@ -289,7 +289,7 @@ steps:
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
 
-- label: "PyTorch Fullgraph Test" # 18min
+- label: PyTorch Fullgraph Test # 18min
   source_file_dependencies:
   - vllm/
   - tests/compile

tests/basic_correctness/test_basic_correctness.py

Lines changed: 5 additions & 6 deletions
@@ -9,7 +9,6 @@
 import pytest
 
 from vllm import LLM
-from vllm.config import LoadFormat
 from vllm.platforms import current_platform
 
 from ..conftest import VllmRunner
@@ -34,7 +33,7 @@ def v1(run_with_both_engines):
 
 def test_vllm_gc_ed():
     """Verify vllm instance is GC'ed when it is deleted"""
-    llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER)
+    llm = LLM("distilbert/distilgpt2")
     weak_llm = weakref.ref(llm)
     del llm
     # If there's any circular reference to vllm, this fails
@@ -43,10 +42,10 @@ def test_vllm_gc_ed():
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
-@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("enforce_eager", [False])
 def test_models(
     hf_runner,
     model: str,
@@ -97,8 +96,8 @@ def test_models(
     "test_suite", [
         ("distilbert/distilgpt2", "ray", "", "L4"),
         ("distilbert/distilgpt2", "mp", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
-        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
+        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
         ("distilbert/distilgpt2", "ray", "", "A100"),
         ("distilbert/distilgpt2", "mp", "", "A100"),
         ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),

tests/basic_correctness/test_cumem.py

Lines changed: 2 additions & 7 deletions
@@ -4,11 +4,9 @@
 import torch
 
 from vllm import LLM, SamplingParams
-from vllm.config import LoadFormat
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.utils import GiB_bytes
 
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..utils import fork_new_process_for_each_test
 
 
@@ -121,7 +119,7 @@ def model(x):
     "model, use_v1",
     [
         # sleep mode with safetensors
-        (f"{MODEL_WEIGHTS_S3_BUCKET}/meta-llama/Llama-3.2-1B", True),
+        ("meta-llama/Llama-3.2-1B", True),
         # sleep mode with pytorch checkpoint
         ("facebook/opt-125m", False),
     ])
@@ -130,10 +128,7 @@ def test_end_to_end(model: str, use_v1: bool):
     os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
     free, total = torch.cuda.mem_get_info()
     used_bytes_baseline = total - free  # in case other process is running
-    load_format = LoadFormat.AUTO
-    if "Llama" in model:
-        load_format = LoadFormat.RUNAI_STREAMER
-    llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
+    llm = LLM(model, enable_sleep_mode=True)
     prompt = "How are you?"
     sampling_params = SamplingParams(temperature=0, max_tokens=10)
     output = llm.generate(prompt, sampling_params)
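For reference, the simplified path the updated test_end_to_end now exercises reads roughly as follows; this is a sketch assembled from the new lines above (no explicit load_format, so the default loader resolves the checkpoint format):

import torch
from vllm import LLM, SamplingParams

free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free  # in case another process is running

# Model name is now the plain Hugging Face identifier; CI may redirect it to S3 separately.
llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True)
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate("How are you?", sampling_params)
print(output[0].outputs[0].text)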

tests/conftest.py

Lines changed: 1 addition & 72 deletions
@@ -24,7 +24,7 @@
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                               init_distributed_environment,
@@ -47,70 +47,6 @@
 
 _M = TypeVar("_M")
 
-MODELS_ON_S3 = [
-    "distilbert/distilgpt2",
-    "meta-llama/Llama-2-7b-hf",
-    "meta-llama/Meta-Llama-3-8B",
-    "meta-llama/Llama-3.2-1B",
-    "meta-llama/Llama-3.2-1B-Instruct",
-    "openai-community/gpt2",
-    "ArthurZ/Ilama-3.2-1B",
-    "llava-hf/llava-1.5-7b-hf",
-    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-    "ai21labs/Jamba-tiny-random",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "nm-testing/Phi-3-mini-128k-instruct-FP8",
-    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
-    "AMead10/Llama-3.2-1B-Instruct-AWQ",
-    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
-    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
-    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
-    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
-    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
-    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
-    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
-    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
-    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
-    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
-    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
-    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
-    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
-    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
-    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
-    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
-    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
-    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
-    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
-    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
-    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
-    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
-]
-
-MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
-
 _PromptMultiModalInput = Union[List[_M], List[List[_M]]]
 
 PromptImageInput = _PromptMultiModalInput[Image.Image]
@@ -742,14 +678,8 @@ def __init__(
         enable_chunked_prefill: bool = False,
         swap_space: int = 4,
         enforce_eager: Optional[bool] = False,
-        load_format: Optional[LoadFormat] = None,
         **kwargs,
     ) -> None:
-        if model_name in MODELS_ON_S3 and not load_format:
-            model_name = (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name}")
-            load_format = LoadFormat.RUNAI_STREAMER
-        if not load_format:
-            load_format = LoadFormat.AUTO
         self.model = LLM(
             model=model_name,
             task=task,
@@ -764,7 +694,6 @@ def __init__(
             max_model_len=max_model_len,
             block_size=block_size,
             enable_chunked_prefill=enable_chunked_prefill,
-            load_format=load_format,
             **kwargs,
         )
 
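With the MODELS_ON_S3 allowlist and the load_format plumbing gone, VllmRunner forwards model names untouched; tests pass plain Hugging Face identifiers, and any load_format override now has to travel through **kwargs. A minimal sketch of how a test in this suite would construct the runner after this change (generate_greedy shown as the usual helper; relative import as used inside the test tree):

from ..conftest import VllmRunner  # in-suite relative import, as in the tests above


def test_smoke_example():
    # No S3 rewriting and no forced RUNAI_STREAMER load format anymore.
    with VllmRunner("distilbert/distilgpt2", max_model_len=1024) as vllm_model:
        outputs = vllm_model.generate_greedy(["Hello, my name is"], max_tokens=5)
        assert len(outputs) == 1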

tests/engine/test_computed_prefix_blocks.py

Lines changed: 1 addition & 6 deletions
@@ -2,16 +2,12 @@
 
 import pytest
 
-from vllm.config import LoadFormat
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.sampling_params import SamplingParams
 
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 
-
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 @pytest.mark.parametrize("block_size", [16])
 def test_computed_prefix_blocks(model: str, block_size: int):
     # This test checks if we are able to run the engine to completion
@@ -28,7 +24,6 @@ def test_computed_prefix_blocks(model: str, block_size: int):
                  "decoration.")
 
     engine_args = EngineArgs(model=model,
-                             load_format=LoadFormat.RUNAI_STREAMER,
                              block_size=block_size,
                              enable_prefix_caching=True)
 
tests/engine/test_detokenization.py

Lines changed: 2 additions & 6 deletions
@@ -2,15 +2,11 @@
 
 import pytest
 
-from vllm.config import LoadFormat
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams
 
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 
-
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_computed_prefix_blocks(model: str):
     # This test checks if the engine generates completions both with and
     # without optional detokenization, that detokenization includes text
@@ -21,7 +17,7 @@ def test_computed_prefix_blocks(model: str):
              "paper clips? Is there an easy to follow video tutorial available "
              "online for free?")
 
-    llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER)
+    llm = LLM(model=model)
     sampling_params = SamplingParams(max_tokens=10,
                                      temperature=0.0,
                                      detokenize=False)
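The behaviour under test is unchanged; only the model reference is. For context, a sketch of what detokenize=False is expected to yield (token IDs populated while the text field stays empty), assuming the standard RequestOutput/CompletionOutput fields:

from vllm import LLM, SamplingParams

llm = LLM(model="distilbert/distilgpt2")
params = SamplingParams(max_tokens=10, temperature=0.0, detokenize=False)

(request_output, ) = llm.generate(["A short example prompt"], params)
completion = request_output.outputs[0]
print(completion.token_ids)   # token ids are still returned
print(repr(completion.text))  # expected to be empty without detokenization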

tests/engine/test_executor.py

Lines changed: 4 additions & 17 deletions
@@ -6,17 +6,12 @@
 
 import pytest
 
-from vllm.config import LoadFormat
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
 from vllm.executor.uniproc_executor import UniProcExecutor
 from vllm.sampling_params import SamplingParams
 
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
-
-RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
-
 
 class Mock:
     ...
@@ -38,12 +33,10 @@ def collective_rpc(self,
 CustomUniExecutorAsync = CustomUniExecutor
 
 
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_custom_executor_type_checking(model):
     with pytest.raises(ValueError):
         engine_args = EngineArgs(model=model,
-                                 load_format=RUNAI_STREAMER_LOAD_FORMAT,
                                  distributed_executor_backend=Mock)
         LLMEngine.from_engine_args(engine_args)
     with pytest.raises(ValueError):
@@ -52,8 +45,7 @@ def test_custom_executor_type_checking(model):
         AsyncLLMEngine.from_engine_args(engine_args)
 
 
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_custom_executor(model, tmp_path):
     cwd = os.path.abspath(".")
     os.chdir(tmp_path)
@@ -62,7 +54,6 @@ def test_custom_executor(model, tmp_path):
 
     engine_args = EngineArgs(
         model=model,
-        load_format=RUNAI_STREAMER_LOAD_FORMAT,
         distributed_executor_backend=CustomUniExecutor,
         enforce_eager=True,  # reduce test time
     )
@@ -77,8 +68,7 @@ def test_custom_executor(model, tmp_path):
     os.chdir(cwd)
 
 
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_custom_executor_async(model, tmp_path):
     cwd = os.path.abspath(".")
     os.chdir(tmp_path)
@@ -87,7 +77,6 @@ def test_custom_executor_async(model, tmp_path):
 
     engine_args = AsyncEngineArgs(
         model=model,
-        load_format=RUNAI_STREAMER_LOAD_FORMAT,
         distributed_executor_backend=CustomUniExecutorAsync,
         enforce_eager=True,  # reduce test time
     )
@@ -106,8 +95,7 @@ async def t():
     os.chdir(cwd)
 
 
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_respect_ray(model):
     # even for TP=1 and PP=1,
     # if users specify ray, we should use ray.
@@ -116,7 +104,6 @@ def test_respect_ray(model):
     engine_args = EngineArgs(
         model=model,
         distributed_executor_backend="ray",
-        load_format=RUNAI_STREAMER_LOAD_FORMAT,
         enforce_eager=True,  # reduce test time
     )
     engine = LLMEngine.from_engine_args(engine_args)
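After the change these engine tests configure EngineArgs with only the knobs they actually exercise. A condensed sketch of the ray-backend case from the last hunk, extended with the standard add_request/step driving loop (the loop itself is not part of this diff):

from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams

engine_args = EngineArgs(
    model="distilbert/distilgpt2",
    distributed_executor_backend="ray",  # respected even for TP=1 and PP=1
    enforce_eager=True,  # reduce test time
)
engine = LLMEngine.from_engine_args(engine_args)

engine.add_request("request-0", "Hello, my name is", SamplingParams(max_tokens=8))
while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)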

tests/engine/test_skip_tokenizer_init.py

Lines changed: 5 additions & 8 deletions
@@ -2,22 +2,19 @@
 
 import pytest
 
-from vllm.config import LoadFormat
 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams
 
-from ..conftest import MODEL_WEIGHTS_S3_BUCKET
 
-
-@pytest.mark.parametrize("model",
-                         [f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2"])
+@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
 def test_skip_tokenizer_initialization(model: str):
     # This test checks if the flag skip_tokenizer_init skips the initialization
     # of tokenizer and detokenizer. The generated output is expected to contain
     # token ids.
-    llm = LLM(model=model,
-              skip_tokenizer_init=True,
-              load_format=LoadFormat.RUNAI_STREAMER)
+    llm = LLM(
+        model=model,
+        skip_tokenizer_init=True,
+    )
     sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
 
     with pytest.raises(ValueError, match="cannot pass text prompts when"):

tests/entrypoints/llm/test_chat.py

Lines changed: 3 additions & 10 deletions
@@ -5,17 +5,12 @@
 import pytest
 
 from vllm import LLM
-from vllm.config import LoadFormat
 
-from ...conftest import MODEL_WEIGHTS_S3_BUCKET
 from ..openai.test_vision import TEST_IMAGE_URLS
 
-RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
-
 
 def test_chat():
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
-              load_format=RUNAI_STREAMER_LOAD_FORMAT)
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
 
     prompt1 = "Explain the concept of entropy."
     messages = [
@@ -33,8 +28,7 @@ def test_chat():
 
 
 def test_multi_chat():
-    llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
-              load_format=RUNAI_STREAMER_LOAD_FORMAT)
+    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
 
     prompt1 = "Explain the concept of entropy."
     prompt2 = "Explain what among us is."
@@ -71,8 +65,7 @@ def test_multi_chat():
                          [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
 def test_chat_multi_image(image_urls: List[str]):
     llm = LLM(
-        model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
-        load_format=RUNAI_STREAMER_LOAD_FORMAT,
+        model="microsoft/Phi-3.5-vision-instruct",
         dtype="bfloat16",
         max_model_len=4096,
         max_num_seqs=5,