Skip to content

Commit 793b065

Browse files
authored
Merge branch 'marlin-moe-zero-points' into awq_moe
2 parents b54b633 + e0e5a74 commit 793b065

527 files changed

Lines changed: 31616 additions & 10662 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
2+
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
3+
tasks:
4+
- name: "gsm8k"
5+
metrics:
6+
- name: "exact_match,strict-match"
7+
value: 0.764
8+
- name: "exact_match,flexible-extract"
9+
value: 0.764
10+
limit: 250
11+
num_fewshot: 5

.buildkite/lm-eval-harness/configs/models-small.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
Meta-Llama-3-8B-Instruct.yaml
22
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
33
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
4+
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
45
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
56
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
67
Minitron-4B-Base-FP8.yaml

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,15 @@ def test_lm_eval_correctness():
4949
results = launch_lm_eval(eval_config)
5050

5151
# Confirm scores match ground truth.
52+
success = True
5253
for task in eval_config["tasks"]:
5354
for metric in task["metrics"]:
5455
ground_truth = metric["value"]
5556
measured_value = results["results"][task["name"]][metric["name"]]
5657
print(f'{task["name"]} | {metric["name"]}: '
5758
f'ground_truth={ground_truth} | measured={measured_value}')
58-
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
59+
success = success and numpy.isclose(
60+
ground_truth, measured_value, rtol=RTOL)
61+
62+
# Assert at the end, print all scores even on failure for debugging.
63+
assert success

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@ steps:
88
containers:
99
- image: badouralix/curl-jq
1010
command:
11-
- sh
12-
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
11+
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
1312
- wait
1413
- label: "A100"
1514
agents:

.buildkite/nightly-benchmarks/scripts/wait-for-image.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
33
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
44

5+
TIMEOUT_SECONDS=10
6+
57
retries=0
68
while [ $retries -lt 1000 ]; do
7-
if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
9+
if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
810
exit 0
911
fi
1012

.buildkite/release-pipeline.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@ steps:
88
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
99
# rename the files to change linux -> manylinux1
1010
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
11-
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
12-
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
11+
- "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
12+
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
13+
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
1314
env:
1415
DOCKER_BUILDKIT: "1"
1516

.buildkite/run-amd-test.sh

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,47 @@ mkdir -p ${HF_CACHE}
7171
HF_MOUNT="/root/.cache/huggingface"
7272

7373
commands=$@
74+
echo "Commands:$commands"
75+
#ignore certain kernels tests
76+
if [[ $commands == *" kernels "* ]]; then
77+
commands="${commands} \
78+
--ignore=kernels/test_attention.py \
79+
--ignore=kernels/test_attention_selector.py \
80+
--ignore=kernels/test_blocksparse_attention.py \
81+
--ignore=kernels/test_causal_conv1d.py \
82+
--ignore=kernels/test_cutlass.py \
83+
--ignore=kernels/test_encoder_decoder_attn.py \
84+
--ignore=kernels/test_flash_attn.py \
85+
--ignore=kernels/test_flashinfer.py \
86+
--ignore=kernels/test_gguf.py \
87+
--ignore=kernels/test_int8_quant.py \
88+
--ignore=kernels/test_machete_gemm.py \
89+
--ignore=kernels/test_mamba_ssm.py \
90+
--ignore=kernels/test_marlin_gemm.py \
91+
--ignore=kernels/test_moe.py \
92+
--ignore=kernels/test_prefix_prefill.py \
93+
--ignore=kernels/test_rand.py \
94+
--ignore=kernels/test_sampler.py"
95+
fi
96+
97+
#ignore certain Entrypoints tests
98+
if [[ $commands == *" entrypoints/openai "* ]]; then
99+
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
100+
--ignore=entrypoints/openai/test_accuracy.py \
101+
--ignore=entrypoints/openai/test_audio.py \
102+
--ignore=entrypoints/openai/test_encoder_decoder.py \
103+
--ignore=entrypoints/openai/test_embedding.py \
104+
--ignore=entrypoints/openai/test_oot_registration.py "}
105+
fi
106+
74107
PARALLEL_JOB_COUNT=8
75108
# check if the command contains shard flag, we will run all shards in parallel because the host has 8 GPUs.
76109
if [[ $commands == *"--shard-id="* ]]; then
77110
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
78111
#replace shard arguments
79-
commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
112+
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
80113
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
114+
echo "Shard ${GPU} commands:$commands"
81115
docker run \
82116
--device /dev/kfd --device /dev/dri \
83117
--network host \

.buildkite/run-cpu-test-ppc64le.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@ trap remove_docker_container EXIT
1111
remove_docker_container
1212

1313
# Run the image, setting --shm-size=4g for tensor parallel.
14+
source /etc/environment
1415
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
15-
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test cpu-test
16+
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
1617

1718
# Run basic model test
1819
docker exec cpu-test bash -c "

.buildkite/run-cpu-test.sh

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,17 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
2222

2323
# Run basic model test
2424
docker exec cpu-test bash -c "
25-
pip install pytest matplotlib einops transformers_stream_generator
26-
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
27-
--ignore=tests/models/test_oot_registration.py \
28-
--ignore=tests/models/test_registry.py \
29-
--ignore=tests/models/test_fp8.py \
30-
--ignore=tests/models/test_jamba.py \
31-
--ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU are not supported
25+
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
26+
pytest -v -s tests/models/decoder_only/language \
27+
--ignore=tests/models/test_fp8.py \
28+
--ignore=tests/models/decoder_only/language/test_jamba.py \
29+
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU are not supported
30+
31+
# Run compressed-tensor test
32+
docker exec cpu-test bash -c "
33+
pytest -s -v \
34+
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
35+
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token"
3236

3337
# online inference
3438
docker exec cpu-test bash -c "

.buildkite/run-xpu-test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
1111
remove_docker_container
1212

1313
# Run the image and launch offline inference
14-
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
14+
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py

0 commit comments

Comments
 (0)