
Commit a5829a3

Merge branch 'main' into fa-block-size-revert
2 parents: 1bc4f08 + 8ac3a41

File tree: 124 files changed (+1880, -1624 lines)


.buildkite/scripts/annotate-release.sh

Lines changed: 5 additions & 4 deletions
@@ -23,8 +23,8 @@ To download the wheel (by version):
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .

-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`

 To download and upload the image:
@@ -45,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest rm vllm/vllm-openai:latest
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
-EOF
+EOF
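
The manifest commands above replace the previous `--amend` flow: `docker manifest create --amend` updates whatever manifest list already exists locally, so a stale cached list could carry old entries into the push, whereas removing the list and recreating it presumably guarantees the published tag references exactly the two freshly built images. Below is a minimal sketch of the new sequence for the `latest` tag, assuming both per-architecture images were already pushed; the `|| true` guard and the `inspect` check are illustrative additions, not part of the script.

```bash
# Illustrative sketch of the updated multi-arch publish flow for one tag.
# Assumes vllm/vllm-openai:latest-x86_64 and :latest-aarch64 are already pushed.
docker manifest rm vllm/vllm-openai:latest || true   # drop any cached local manifest list
docker manifest create vllm/vllm-openai:latest \
  vllm/vllm-openai:latest-x86_64 \
  vllm/vllm-openai:latest-aarch64                    # rebuild it from the two arch images
docker manifest inspect vllm/vllm-openai:latest      # sanity-check that both entries are present
docker manifest push vllm/vllm-openai:latest         # publish the multi-arch tag
```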

.buildkite/test-amd.yaml

Lines changed: 27 additions & 14 deletions
@@ -61,7 +61,7 @@ steps:
   - pytest -v -s -m 'not cpu_test' multimodal
   - pytest -v -s utils_

-- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
   timeout_in_minutes: 10
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
@@ -73,13 +73,15 @@ steps:
   - tests/multimodal
   - tests/standalone_tests/lazy_imports.py
   - tests/transformers_utils
+  - tests/config
   no_gpu: true
   commands:
   - python3 standalone_tests/lazy_imports.py
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
   - pytest -v -s -m 'cpu_test' multimodal
   - pytest -v -s transformers_utils
+  - pytest -v -s config

 - label: Python-only Installation Test # 10min
   timeout_in_minutes: 20
@@ -390,6 +392,15 @@ steps:
   commands:
   - pytest -v -s v1/attention

+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+
 - label: V1 Test others (CPU) # 5 mins
   mirror_hardwares: [amdexperimental, amdproduction]
   agent_pool: mi325_1
@@ -529,7 +540,7 @@ steps:
   - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
   # Limit to no custom ops to reduce running time
   # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Cudagraph test
   timeout_in_minutes: 20
@@ -694,7 +705,7 @@ steps:
   - vllm/model_executor/models/whisper.py
   commands: # LMEval
   # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
-  - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
+  - pytest -s entrypoints/openai/correctness/

 - label: OpenAI-Compatible Tool Use # 23 min
   timeout_in_minutes: 35
@@ -995,12 +1006,12 @@ steps:
   optional: true
   commands:
   - pip install --upgrade git+https://github.com/huggingface/transformers
-  - pytest -v -s tests/models/test_initialization.py
+  - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
   - pytest -v -s tests/models/test_transformers.py
-  - pytest -v -s tests/models/multimodal/processing/
-  - pytest -v -s tests/models/multimodal/test_mapping.py
+  # - pytest -v -s tests/models/multimodal/processing/
+  - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
   - python3 examples/offline_inference/basic/chat.py
-  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+  # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
   # Whisper needs spawn method to avoid deadlock
   - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

@@ -1045,7 +1056,7 @@ steps:
   - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
   - pytest -v -s tests/kernels/moe/test_flashinfer.py

-- label: Blackwell Fusion Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
   gpu: b200
@@ -1066,7 +1077,9 @@ steps:
   - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
   # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
   # Wrap with quotes to escape yaml
-  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+  - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile

 - label: Blackwell Fusion E2E Tests # 30 min
   timeout_in_minutes: 40
@@ -1088,15 +1101,13 @@ steps:
   commands:
   - nvidia-smi
   # Run all e2e fusion tests
-  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+  - pytest -v -s tests/compile/test_fusions_e2e.py

 - label: ROCm GPT-OSS Eval
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
   agent_pool: mi325_1
-  mirror_hardwares: [amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction]
   optional: true # run on nightlies
   source_file_dependencies:
   - tests/evals/gpt_oss
@@ -1416,7 +1427,9 @@ steps:
   - pytest -v -s tests/compile/distributed/test_async_tp.py
   - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
   - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+  - pytest -v -s tests/compile/distributed/test_sequence_parallel.py
   - pytest -v -s tests/distributed/test_context_parallel.py
   - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
   - pytest -v -s tests/v1/distributed/test_dbo.py
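
Several of the updated commands above rely on pytest's `-k` keyword expressions, where each term is matched as a case-insensitive substring of the test ID and terms are combined with `and`/`or`/`not`. For example, `'TRITON and not +quant_fp8 and not Llama-4'` keeps TRITON-backed parametrizations while skipping any test whose ID mentions `+quant_fp8` or `Llama-4`. The sketch below is a self-contained, hypothetical demo of that selection behavior; the test file and parameter IDs are made up and are not part of the vLLM suite.

```bash
# Hypothetical demo of pytest -k substring selection; file name and IDs are illustrative.
cat > test_k_demo.py <<'PYEOF'
import pytest

@pytest.mark.parametrize(
    "case", ["TRITON,+quant_fp8", "TRITON,-quant_fp8", "FLASHINFER,-quant_fp8"]
)
def test_fusion(case):
    assert case  # placeholder body
PYEOF

# Selects only test_fusion[TRITON,-quant_fp8]: the ID must contain "TRITON"
# and must not contain "+quant_fp8".
pytest -v test_k_demo.py -k 'TRITON and not +quant_fp8'
```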

.buildkite/test-pipeline.yaml

Lines changed: 4 additions & 5 deletions
@@ -554,7 +554,6 @@ steps:
   timeout_in_minutes: 45
   gpu: h100
   num_gpus: 1
-  optional: true
   source_file_dependencies:
   - tools/install_deepgemm.sh
   - vllm/utils/deep_gemm.py
@@ -565,10 +564,10 @@
   - tests/kernels/moe/test_batched_deepgemm.py
   - tests/kernels/attention/test_deepgemm_attention.py
   commands:
-  - pytest -v -s tests/kernels/quantization/test_block_fp8.py -k deep_gemm
-  - pytest -v -s tests/kernels/moe/test_deepgemm.py
-  - pytest -v -s tests/kernels/moe/test_batched_deepgemm.py
-  - pytest -v -s tests/kernels/attention/test_deepgemm_attention.py
+  - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+  - pytest -v -s kernels/moe/test_deepgemm.py
+  - pytest -v -s kernels/moe/test_batched_deepgemm.py
+  - pytest -v -s kernels/attention/test_deepgemm_attention.py

 - label: Model Executor Test # 23min
   timeout_in_minutes: 35