Skip to content

Commit 12adc56

Browse files
committed
Merge remote-tracking branch 'upstream/main' into sharded-tensorizer
* upstream/main: (126 commits) [Bugfix][Frontend] Cleanup "fix chat logprobs" (vllm-project#5026) [Bugfix] OpenAI entrypoint limits logprobs while ignoring server defined --max-logprobs (vllm-project#5312) [Misc] Various simplifications and typing fixes (vllm-project#5368) [ci] Fix Buildkite agent path (vllm-project#5392) [Doc] Add documentation for FP8 W8A8 (vllm-project#5388) Bump version to v0.5.0 (vllm-project#5384) [Docs] Alphabetically sort sponsors (vllm-project#5386) [Docs] Add Docs on Limitations of VLM Support (vllm-project#5383) [ci] Mount buildkite agent on Docker container to upload benchmark results (vllm-project#5330) [ci] Use small_cpu_queue for doc build (vllm-project#5331) [Bugfix] Fix LLaVA-NeXT (vllm-project#5380) [Feature][Frontend]: Continued `stream_options` implementation also in CompletionRequest (vllm-project#5319) [Model] Initial support for LLaVA-NeXT (vllm-project#4199) [Misc] Improve error message when LoRA parsing fails (vllm-project#5194) [misc][typo] fix typo (vllm-project#5372) [Frontend][Misc] Enforce Pixel Values as Input Type for VLMs in API Server (vllm-project#5374) [Misc] Update to comply with the new `compressed-tensors` config (vllm-project#5350) [Bugfix] Fix KeyError: 1 When Using LoRA adapters (vllm-project#5164) [Kernel][Misc] Use TORCH_LIBRARY instead of PYBIND11_MODULE for custom ops (vllm-project#5047) [mis][ci/test] fix flaky test in test_sharded_state_loader.py (vllm-project#5361) ...
2 parents 102c8a1 + 640052b commit 12adc56

File tree

269 files changed

+12084
-3334
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

269 files changed

+12084
-3334
lines changed

.buildkite/check-wheel-size.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import os
22
import zipfile
33

4-
MAX_SIZE_MB = 150
4+
MAX_SIZE_MB = 200
55

66

77
def print_top_10_largest_files(zip_file):
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Gate the nightly performance benchmarks: on pull requests, only proceed
# when the PR carries the "perf-benchmarks" label; otherwise exit early.
# Finally uploads the benchmark pipeline to Buildkite.
#
# Required env: BUILDKITE_PULL_REQUEST ("false" or a PR number, set by Buildkite).

set -euo pipefail

# Install system packages (apt-get has a stable CLI for scripts, unlike apt).
apt-get update
apt-get install -y curl jq

# Install minijinja for templating; the installer places the binary under
# ~/.cargo/bin, so source the cargo env file to put it on PATH.
curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
# shellcheck disable=SC1091
source "$HOME/.cargo/env"

# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
  PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')

  if [[ "$PR_LABELS" == *"perf-benchmarks"* ]]; then
    echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
  else
    echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
    exit 0
  fi
fi

# Upload sample.yaml
buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
# Buildkite nightly-benchmark pipeline: runs a smoke command (nvidia-smi)
# on an 8xA100 Kubernetes node. H100 job is stubbed out until hardware is online.
steps:
  # NOTE(simon): You can create separate blocks for different jobs
  - label: "A100: NVIDIA SMI"
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            containers:
              # - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT
              # TODO(simon): check latest main branch or use the PR image.
              - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
                command:
                  - bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls'
                resources:
                  limits:
                    nvidia.com/gpu: 8
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
            nodeSelector:
              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
            volumes:
              - name: devshm
                emptyDir:
                  medium: Memory
  # TODO(simon): bring H100 online
  # - label: "H100: NVIDIA SMI"
  #   agents:
  #     queue: H100
  #   plugins:
  #     - docker#v5.11.0:
  #         image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
  #         command:
  #           - bash -c 'nvidia-smi && nvidia-smi topo -m'
  #         propagate-environment: true
  #         ipc: host
  #         gpus: all

.buildkite/run-amd-test.sh

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,34 @@ set -ex
55
echo "--- ROCm info"
66
rocminfo
77

# cleanup older docker images
cleanup_docker() {
  # Keep the function's working variables out of the global scope; split
  # declaration from assignment so a failing command substitution is not
  # masked by `local`'s own exit status.
  local docker_root disk_usage

  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')

  # Define the threshold: prune only when usage exceeds this percentage.
  local -r threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes
    docker volume prune -f
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}

# Call the cleanup docker function
cleanup_docker
836
echo "--- Resetting GPUs"
937

1038
echo "reset" > /opt/amdgpu/etc/gpu_state

.buildkite/run-benchmarks.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
5050
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
5151
echo "" >> benchmark_results.md
5252
echo '```' >> benchmark_results.md
53-
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
53+
tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
5454
echo '```' >> benchmark_results.md
5555

5656
# if the agent binary is not found, skip uploading the results, exit 0
57-
if [ ! -f /workspace/buildkite-agent ]; then
57+
if [ ! -f /usr/bin/buildkite-agent ]; then
5858
exit 0
5959
fi
6060

6161
# upload the results to buildkite
62-
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
62+
buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
6363

6464
# exit with the exit code of the benchmarks
6565
if [ $bench_latency_exit_code -ne 0 ]; then
@@ -75,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
7575
fi
7676

7777
rm ShareGPT_V3_unfiltered_cleaned_split.json
78-
/workspace/buildkite-agent artifact upload "*.json"
78+
buildkite-agent artifact upload "*.json"

.buildkite/run-cpu-test.sh

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,15 @@ remove_docker_container() { docker rm -f cpu-test || true; }
1010
trap remove_docker_container EXIT
1111
remove_docker_container
1212

13-
# Run the image and launch offline inference
14-
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
13+
# Run the image
14+
docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
15+
16+
# offline inference
17+
docker exec cpu-test bash -c "python3 examples/offline_inference.py"
18+
19+
# Run basic model test
20+
docker exec cpu-test bash -c "cd tests;
21+
pip install pytest Pillow protobuf
22+
bash ../.buildkite/download-images.sh
23+
cd ../
24+
pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"

.buildkite/test-pipeline.yaml

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ steps:
3737
working_dir: "/vllm-workspace/tests"
3838
num_gpus: 2
3939
commands:
40-
- pytest -v -s distributed/test_pynccl_library.py
4140
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
4241
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
4342
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
@@ -46,7 +45,8 @@ steps:
4645
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
4746
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
4847
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
49-
- pytest -v -s spec_decode/e2e/test_integration_dist.py
48+
- pytest -v -s spec_decode/e2e/test_integration_dist.py
49+
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
5050

5151
- label: Distributed Tests (Multiple Groups)
5252
#mirror_hardwares: [amd]
@@ -63,7 +63,6 @@ steps:
6363
mirror_hardwares: [amd]
6464

6565
commands:
66-
- pytest -v -s test_inputs.py
6766
- pytest -v -s entrypoints -m llm
6867
- pytest -v -s entrypoints -m openai
6968

@@ -80,6 +79,13 @@ steps:
8079
- python3 llava_example.py
8180
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
8281

82+
- label: Inputs Test
83+
#mirror_hardwares: [amd]
84+
commands:
85+
- bash ../.buildkite/download-images.sh
86+
- pytest -v -s test_inputs.py
87+
- pytest -v -s multimodal
88+
8389
- label: Kernels Test %N
8490
#mirror_hardwares: [amd]
8591
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -88,14 +94,13 @@ steps:
8894
- label: Models Test
8995
#mirror_hardwares: [amd]
9096
commands:
91-
- bash ../.buildkite/download-images.sh
92-
- pytest -v -s models --ignore=models/test_llava.py
97+
- pytest -v -s models -m \"not llava\"
9398

9499
- label: Llava Test
95100
mirror_hardwares: [amd]
96101
commands:
97102
- bash ../.buildkite/download-images.sh
98-
- pytest -v -s models/test_llava.py
103+
- pytest -v -s models -m llava
99104

100105
- label: Prefix Caching Test
101106
mirror_hardwares: [amd]
@@ -119,7 +124,10 @@ steps:
119124

120125
- label: Speculative decoding tests
121126
#mirror_hardwares: [amd]
122-
command: pytest -v -s spec_decode
127+
commands:
128+
# See https://github.com/vllm-project/vllm/issues/5152
129+
- export VLLM_ATTENTION_BACKEND=XFORMERS
130+
- pytest -v -s spec_decode
123131

124132
- label: LoRA Test %N
125133
#mirror_hardwares: [amd]
@@ -131,14 +139,7 @@ steps:
131139
num_gpus: 4
132140
# This test runs llama 13B, so it is required to run on 4 GPUs.
133141
commands:
134-
# Temporarily run this way because we cannot clean up GPU mem usage
135-
# for multi GPU tests.
136-
# TODO(sang): Fix it.
137-
- pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
138-
- pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
139-
- pytest -v -s lora/test_long_context.py::test_self_consistency
140-
- pytest -v -s lora/test_long_context.py::test_quality
141-
- pytest -v -s lora/test_long_context.py::test_max_len
142+
- pytest -v -s -x lora/test_long_context.py
142143

143144
- label: Tensorizer Test
144145
#mirror_hardwares: [amd]

.buildkite/test-template-aws.j2

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
  - label: ":docker: build image"
    agents:
      queue: cpu_queue
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
        - exit_status: -10  # Agent was lost
          limit: 5
  - wait

{% for step in steps %}
  - label: "{{ step.label }}"
    agents:
      {% if step.label == "Documentation Build" %}
      queue: small_cpu_queue
      {% elif step.no_gpu %}
      queue: cpu_queue
      {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
      queue: gpu_4_queue
      {% else %}
      queue: gpu_1_queue
      {% endif %}
    soft_fail: true
    {% if step.parallelism %}
    parallelism: {{ step.parallelism }}
    {% endif %}
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
        - exit_status: -10  # Agent was lost
          limit: 5
    plugins:
      - docker#v5.2.0:
          image: {{ docker_image }}
          always-pull: true
          propagate-environment: true
          {% if not step.no_gpu %}
          gpus: all
          {% endif %}
          {% if step.label == "Benchmarks" %}
          mount-buildkite-agent: true
          {% endif %}
          command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
          environment:
            - VLLM_USAGE_SOURCE=ci-test
            - HF_TOKEN
            {% if step.label == "Speculative decoding tests" %}
            - VLLM_ATTENTION_BACKEND=XFORMERS
            {% endif %}
          volumes:
            - /dev/shm:/dev/shm
{% endfor %}

.buildkite/test-template.j2

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
steps:
66
- label: ":docker: build image"
7-
commands:
7+
commands:
88
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
99
- "docker push {{ docker_image }}"
1010
env:
@@ -28,6 +28,7 @@ steps:
2828
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
2929
env:
3030
DOCKER_BUILDKIT: "1"
31+
soft_fail: true
3132
{% endif %}
3233
{% endfor %}
3334

@@ -36,10 +37,12 @@ steps:
3637
agents:
3738
queue: neuron
3839
command: bash .buildkite/run-neuron-test.sh
39-
soft_fail: true
40+
soft_fail: false
4041

4142
- label: "Intel Test"
4243
depends_on: ~
44+
agents:
45+
queue: intel
4346
command: bash .buildkite/run-cpu-test.sh
4447

4548
{% for step in steps %}

.github/workflows/mypy.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ jobs:
3737
mypy vllm/distributed --config-file pyproject.toml
3838
mypy vllm/entrypoints --config-file pyproject.toml
3939
mypy vllm/executor --config-file pyproject.toml
40+
mypy vllm/multimodal --config-file pyproject.toml
4041
mypy vllm/usage --config-file pyproject.toml
4142
mypy vllm/*.py --config-file pyproject.toml
4243
mypy vllm/transformers_utils --config-file pyproject.toml

0 commit comments

Comments
 (0)