vLLM Benchmark #1625
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: vLLM Benchmark

on:
  schedule:
    # Run daily at 1:15 AM PST (09:15 UTC)
    - cron: '15 9 * * *'
  workflow_dispatch:
    inputs:
      vllm_branch:
        description: vLLM branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER/head for pre-merge check on pull request)
        required: true
        type: string
        default: main
      vllm_commit:
        description: vLLM commit (optional, default to the latest commit in the branch that has not yet been benchmarked)
        required: false
        type: string
      models:
        description: |
          A comma-separated list of models from vllm-benchmarks/benchmarks (optional, default to run everything)
        required: false
        type: string
      runners:
        description: |
          A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
        required: true
        type: string
        default: rocm,spr,gnr,gaudi3,m8g
      hf_offline:
        description: Run with HuggingFace offline mode (set TRANSFORMERS_OFFLINE=1)
        required: false
        type: boolean
        default: false
# Deduplicate runs: a newer run for the same workflow/commit/trigger kind
# cancels any one still in flight.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true
jobs:
  # Generate the model x runner matrix consumed by the benchmarks job below.
  set-parameters:
    runs-on: ubuntu-latest
    outputs:
      benchmark_matrix: ${{ steps.set-parameters.outputs.benchmark_matrix }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Set parameters
        id: set-parameters
        shell: bash
        env:
          MODELS: ${{ inputs.models || '' }}
          RUNNERS: ${{ inputs.runners || '' }}
        run: |
          set -eux
          # The generated matrix is grouped by model and runner
          python .github/scripts/generate_vllm_benchmark_matrix.py \
            --benchmark-configs-dir vllm-benchmarks/benchmarks \
            --models "${MODELS}" \
            --runners "${RUNNERS}"
| benchmarks: | |
| name: Run vLLM benchmarks | |
| needs: set-parameters | |
| if: ${{ !github.event.pull_request.head.repo.fork && github.repository_owner == 'pytorch' }} | |
| strategy: | |
| matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_matrix) }} | |
| fail-fast: false | |
| runs-on: ${{ matrix.runner }} | |
| environment: pytorch-x-vllm | |
| permissions: | |
| id-token: write | |
| contents: read | |
| steps: | |
| - name: Fix workspace permissions | |
| run: | | |
| sudo -n chown -R $USER:$USER $GITHUB_WORKSPACE || true | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| - name: Checkout vLLM repository | |
| uses: actions/checkout@v4 | |
| with: | |
| repository: vllm-project/vllm | |
| path: vllm-benchmarks/vllm | |
| ref: ${{ inputs.vllm_branch || 'main' }} | |
| fetch-depth: 0 | |
| - uses: actions/setup-python@v5 | |
| # Amazon Linux fails on this step | |
| continue-on-error: true | |
| with: | |
| python-version: '3.12' | |
| cache: 'pip' | |
| - name: Get workflow job id | |
| id: get-job-id | |
| uses: pytorch/test-infra/.github/actions/get-workflow-job-id@36562d6c43fa914f7bdef67ce23e5c31f1387b2e | |
| with: | |
| github-token: ${{ secrets.GITHUB_TOKEN }} | |
| - name: Check if the device is supported | |
| shell: bash | |
| run: | | |
| set -eux | |
| if command -v nvidia-smi; then | |
| DEVICE_NAME=cuda | |
| nvidia-smi | |
| elif command -v rocm-smi; then | |
| DEVICE_NAME=rocm | |
| rocm-smi | |
| elif command -v hl-smi; then | |
| DEVICE_NAME=hpu | |
| hl-smi | |
| else | |
| arch=$(uname -m) | |
| case "$arch" in | |
| aarch64|arm64) | |
| DEVICE_NAME=arm64-cpu | |
| ;; | |
| *) | |
| DEVICE_NAME=cpu | |
| ;; | |
| esac | |
| lscpu | |
| fi | |
| echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV | |
| - name: Set GPU name and type | |
| working-directory: vllm-benchmarks | |
| shell: bash | |
| run: | | |
| set -eux | |
| if [[ "${DEVICE_NAME}" == "cuda" ]]; then | |
| # Return the same device name as PyTorch | |
| DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader) | |
| elif [[ "${DEVICE_NAME}" == "rocm" ]]; then | |
| DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs) | |
| elif [[ "${DEVICE_NAME}" == "hpu" ]]; then | |
| DEVICE_TYPE="Intel Gaudi3 "$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//') | |
| elif [[ "${DEVICE_NAME}" == "cpu" ]]; then | |
| DEVICE_TYPE="$(lscpu | grep "Model name" | sed -E 's/.*Model name:[[:space:]]*//; s/Intel\(R\)//g; s/\(R\)//g; s/\(TM\)//g; s/CPU//g; s/Processor//g; s/[[:space:]]+/ /g; s/^ //; s/ $//; s/ /_/g')_$(awk -F: '/Core\(s\) per socket/ {c=$2} /Socket\(s\)/ {s=$2} END {gsub(/ /,"",c); gsub(/ /,"",s); printf "%sc", c*s}' < <(lscpu))" | |
| elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then | |
| DEVICE_TYPE=$(lscpu | grep 'Vendor ID' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ") | |
| fi | |
| echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV | |
| - name: Install dependencies | |
| shell: bash | |
| run: | | |
| set -eux | |
| if [[ "${DEVICE_NAME}" == "rocm" ]]; then | |
| pip install -r .github/scripts/requirements.txt \ | |
| --extra-index-url https://download.pytorch.org/whl/rocm6.3 | |
| elif [[ "${DEVICE_NAME}" == "hpu" ]] || [[ "${DEVICE_NAME}" == *cpu* ]]; then | |
| grep -v "^torch==" .github/scripts/requirements.txt > /tmp/requirements_no_torch.txt | |
| pip install -r /tmp/requirements_no_torch.txt | |
| else | |
| pip install -r .github/scripts/requirements.txt \ | |
| --extra-index-url https://download.pytorch.org/whl/cu128 | |
| fi | |
| - name: Set Docker registry | |
| shell: bash | |
| env: | |
| HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }} | |
| run: | | |
| set -eux | |
| # Mimic the logic from vllm ci-infra test template | |
| if [[ "${HEAD_BRANCH}" == "main" ]]; then | |
| DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo | |
| else | |
| DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-test-repo | |
| fi | |
| DOCKER_IMAGE_SUFFIX="" | |
| if [[ "${DEVICE_NAME}" == "rocm" ]]; then | |
| DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci | |
| elif [[ "${DEVICE_NAME}" == "hpu" ]]; then | |
| DOCKER_IMAGE_SUFFIX=-hpu | |
| elif [[ "${DEVICE_NAME}" == "cpu" ]]; then | |
| DOCKER_IMAGE_SUFFIX=-cpu | |
| elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then | |
| DOCKER_IMAGE_SUFFIX=-arm64-cpu | |
| fi | |
| echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV | |
| echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV | |
| - name: Authenticate with AWS | |
| # Only need for DGX hosts | |
| if: contains(env.DEVICE_TYPE, 'B200') | |
| uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 | |
| with: | |
| role-to-assume: arn:aws:iam::308535385114:role/public_ecr_read_only | |
| role-duration-seconds: 18000 | |
| aws-region: us-east-1 | |
| - name: Login to public.ecr.aws | |
| # Only need for DGX hosts | |
| if: contains(env.DEVICE_TYPE, 'B200') | |
| uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 | |
| with: | |
| registry-type: public | |
| - name: Clean up unused Docker images | |
| if: env.DEVICE_NAME == 'hpu' || contains(matrix.runner, 'gnr') | |
| run: | | |
| set -eux | |
| docker image prune -a -f || true | |
| - name: Check for last benchmark commit | |
| working-directory: vllm-benchmarks | |
| env: | |
| HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }} | |
| HEAD_SHA: ${{ inputs.vllm_commit || '' }} | |
| MODELS: ${{ matrix.models }} | |
| run: | | |
| set -eux | |
| if [[ -z "${HEAD_SHA}" ]]; then | |
| pushd vllm | |
| # Looking back the latest 100 commits is enough | |
| for i in {0..99} | |
| do | |
| # Check if the image is there, if it doesn't then check an older one | |
| # because the commit is too recent | |
| HEAD_SHA=$(git rev-parse --verify HEAD~${i}) | |
| DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" | |
| # No Docker image available yet because the commit is too recent | |
| if ! docker manifest inspect "${DOCKER_IMAGE}"; then | |
| continue | |
| fi | |
| NOT_EXIST=0 | |
| S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json" | |
| aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1 | |
| if [[ ${NOT_EXIST} == "1" ]]; then | |
| echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet" | |
| break | |
| fi | |
| done | |
| popd | |
| fi | |
| echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV | |
| # Print the benchmark commit for reference | |
| echo "### Run benchmark on [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}" | |
| - name: Setup CUDA GPU_FLAG for docker run | |
| if: env.DEVICE_NAME == 'cuda' | |
| run: | | |
| echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" | |
| - name: Setup ROCm | |
| if: env.DEVICE_NAME == 'rocm' | |
| uses: pytorch/pytorch/./.github/actions/setup-rocm@main | |
| - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container | |
| run: | | |
| echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}" | |
| - name: Setup benchmark tests | |
| env: | |
| MODELS: ${{ matrix.models }} | |
| run: | | |
| set -eux | |
| pushd vllm-benchmarks/vllm | |
| git checkout "${HEAD_SHA}" | |
| rm .buildkite/performance-benchmarks/tests/*.json || true | |
| popd | |
| # Set the list of benchmarks we want to cover in this runner | |
| python3 .github/scripts/setup_vllm_benchmark.py \ | |
| --from-benchmark-configs-dir vllm-benchmarks/benchmarks \ | |
| --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/performance-benchmarks/tests \ | |
| --models "${MODELS}" \ | |
| --device "${DEVICE_NAME}" \ | |
| --include-eager-mode | |
| pushd vllm-benchmarks/vllm | |
| ls -lah .buildkite/performance-benchmarks/tests | |
| find .buildkite/performance-benchmarks/tests -type f -exec cat {} \; | |
| popd | |
| - name: Run vLLM benchmark | |
| env: | |
| SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 | |
| SCCACHE_REGION: us-east-1 | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| HF_HOME: /mnt/hf_cache | |
| FLASHINFER_WORKSPACE_BASE: /mnt/hf_cache | |
| TRANSFORMERS_OFFLINE: ${{ github.event_name == 'workflow_dispatch' && inputs.hf_offline == false && '0' || '1' }} | |
| DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_SUFFIX }} | |
| # vLLM-related environment variables | |
| ENGINE_VERSION: v1 | |
| SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 | |
| run: | | |
| set -eux | |
| ON_CPU=0 | |
| if [[ "${DEVICE_NAME}" == "cpu" ]] || [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then | |
| ON_CPU=1 | |
| fi | |
| # Just create an empty HF_CACHE dir if it doesn't exist | |
| if [[ ! -d "${HF_HOME}" ]]; then | |
| export HF_HOME="${RUNNER_TEMP}/hf_cache" | |
| export FLASHINFER_WORKSPACE_BASE="${HF_HOME}" | |
| mkdir -p "${HF_HOME}" | |
| # When there is no cache directory, e.g. benchmark, the job has no | |
| # way but to reach out to HF if needed | |
| export TRANSFORMERS_OFFLINE=0 | |
| fi | |
| container_name=$(docker run \ | |
| ${GPU_FLAG:-} \ | |
| ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ | |
| -e SCCACHE_BUCKET \ | |
| -e SCCACHE_REGION \ | |
| -e DEVICE_NAME \ | |
| -e DEVICE_TYPE \ | |
| -e HF_TOKEN \ | |
| -e HF_HOME \ | |
| -e FLASHINFER_WORKSPACE_BASE \ | |
| -e TRANSFORMERS_OFFLINE \ | |
| -e ENGINE_VERSION \ | |
| -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ | |
| -e ON_CPU="${ON_CPU}" \ | |
| --ipc=host \ | |
| --tty \ | |
| --detach \ | |
| --security-opt seccomp=unconfined \ | |
| --shm-size=4g \ | |
| -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ | |
| -v "${HF_HOME}:${HF_HOME}" \ | |
| -w /tmp/workspace \ | |
| "${DOCKER_IMAGE}" | |
| ) | |
| if [[ "${DEVICE_NAME}" == "cuda" ]]; then | |
| docker exec -t "${container_name}" bash -c " | |
| pip install torchao==0.16.0 fbgemm-gpu-genai==1.5.0 | |
| # A quick mitigation for https://github.com/vllm-project/vllm/issues/32373 | |
| rm /etc/ld.so.conf.d/cuda-compat.conf || true | |
| ldconfig | |
| " | |
| fi | |
| docker exec -t "${container_name}" bash -c " | |
| cd vllm-benchmarks/vllm && bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh | |
| " | |
| - name: Authenticate with AWS | |
| # AWS CUDA runners already have access to the bucket via its runner IAM role | |
| if: env.DEVICE_NAME == 'rocm' || env.DEVICE_NAME == 'hpu' || contains(env.DEVICE_TYPE, 'B200') || contains(matrix.runner, 'gnr') | |
| uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 | |
| with: | |
| role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results | |
| # The max duration enforced by the server side | |
| role-duration-seconds: 18000 | |
| aws-region: us-east-1 | |
| - name: Check the benchmark results | |
| env: | |
| BENCHMARK_RESULTS: vllm-benchmarks/vllm/benchmarks/results | |
| run: | | |
| set -eux | |
| sudo chown -R ${UID} "${BENCHMARK_RESULTS}" | |
| ls -lah "${BENCHMARK_RESULTS}" | |
| # Fail when there is no result or if the metrics are all zero | |
| python3 .github/scripts/check_benchmark_results.py \ | |
| --benchmark-results "${BENCHMARK_RESULTS}" \ | |
| --strict | |
| - name: Upload the benchmark results | |
| env: | |
| BENCHMARK_RESULTS: vllm-benchmarks/vllm/benchmarks/results | |
| MODELS: ${{ matrix.models }} | |
| WORKFLOW_RUN_ID: ${{ github.run_id }} | |
| RUN_ATTEMPT: ${{ github.run_attempt }} | |
| JOB_ID: ${{ steps.get-job-id.outputs.job-id }} | |
| RUN_LOCAL_BRANCH: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.ref || github.ref }} | |
| run: | | |
| set -eux | |
| echo "## workflow info: ${WORKFLOW_RUN_ID} ${RUN_ATTEMPT} ${JOB_ID}" | |
| SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g") | |
| SANITIZED_MODELS="${MODELS//\//_}" | |
| python3 .github/scripts/upload_benchmark_results.py \ | |
| --repo vllm-benchmarks/vllm \ | |
| --benchmark-name "vLLM benchmark" \ | |
| --benchmark-results "${BENCHMARK_RESULTS}" \ | |
| --device-name "${DEVICE_NAME}" \ | |
| --device-type "${SANITIZED_DEVICE_TYPE}" \ | |
| --model "${SANITIZED_MODELS}" | |
| echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV | |
| echo "SANITIZED_MODELS=$SANITIZED_MODELS" >> $GITHUB_ENV | |
| # Keep a copy of the benchmark results on GitHub for reference | |
| - uses: actions/upload-artifact@v4 | |
| with: | |
| name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }} | |
| path: vllm-benchmarks/vllm/benchmarks/results |