vLLM Benchmark #1625
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: vLLM Benchmark

on:
  schedule:
    # Run daily at 1:15 AM PST (09:15 UTC)
    - cron: '15 9 * * *'
  workflow_dispatch:
    inputs:
      vllm_branch:
        description: vLLM branch (main, releases/vERSION for release validation, or refs/pull/PR_NUMBER/head for pre-merge check on pull request)
        required: true
        type: string
        default: main
      vllm_commit:
        description: vLLM commit (optional, default to the latest commit in the branch that has not yet been benchmarked)
        required: false
        type: string
      models:
        description: |
          A comma-separated list of models from vllm-benchmarks/benchmarks (optional, default to run everything)
        required: false
        type: string
      runners:
        description: |
          A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
        required: true
        type: string
        default: rocm,spr,gnr,gaudi3,m8g
      hf_offline:
        description: Run with HuggingFace offline mode (set TRANSFORMERS_OFFLINE=1)
        required: false
        type: boolean
        default: false
# Deduplicate runs: a newer run for the same workflow/commit/trigger kind
# cancels any one still in flight.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true
jobs:
  # Generate the model x runner matrix consumed by the benchmarks job below.
  set-parameters:
    runs-on: ubuntu-latest
    outputs:
      benchmark_matrix: ${{ steps.set-parameters.outputs.benchmark_matrix }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Set parameters
        id: set-parameters
        shell: bash
        env:
          MODELS: ${{ inputs.models || '' }}
          RUNNERS: ${{ inputs.runners || '' }}
        run: |
          set -eux
          # The generated matrix is grouped by model and runner
          python .github/scripts/generate_vllm_benchmark_matrix.py \
            --benchmark-configs-dir vllm-benchmarks/benchmarks \
            --models "${MODELS}" \
            --runners "${RUNNERS}"
| benchmarks: | |
| name: Run vLLM benchmarks | |
| needs: set-parameters | |
| if: ${{ !github.event.pull_request.head.repo.fork && github.repository_owner == 'pytorch' }} | |
| strategy: | |
| matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_matrix) }} | |
| fail-fast: false | |
| runs-on: ${{ matrix.runner }} | |
| environment: pytorch-x-vllm | |
| permissions: | |
| id-token: write | |
| contents: read | |
| steps: | |
| - name: Fix workspace permissions | |
| run: | | |
| sudo -n chown -R $USER:$USER $GITHUB_WORKSPACE || true | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| - name: Checkout vLLM repository | |
| uses: actions/checkout@v4 | |
| with: | |
| repository: vllm-project/vllm | |
| path: vllm-benchmarks/vllm | |
| ref: ${{ inputs.vllm_branch || 'main' }} | |
| fetch-depth: 0 | |
| - uses: actions/setup-python@v5 | |
| # Amazon Linux fails on this step | |
| continue-on-error: true | |
| with: | |
| python-version: '3.12' | |
| cache: 'pip' | |
| - name: Get workflow job id | |
| id: get-job-id | |
| uses: pytorch/test-infra/.github/actions/get-workflow-job-id@36562d6c43fa914f7bdef67ce23e5c31f1387b2e | |
| with: | |
| github-token: ${{ secrets.GITHUB_TOKEN }} | |
| - name: Check if the device is supported | |
| shell: bash | |
| run: | | |
| set -eux | |
| if command -v nvidia-smi; then | |
| DEVICE_NAME=cuda | |
| nvidia-smi | |
| elif command -v rocm-smi; then | |
| DEVICE_NAME=rocm | |
| rocm-smi | |
| elif command -v hl-smi; then | |
| DEVICE_NAME=hpu | |
| hl-smi | |
| else | |
| arch=$(uname -m) | |
| case "$arch" in | |
| aarch64|arm64) | |
| DEVICE_NAME=arm64-cpu | |
| ;; | |
| *) | |
| DEVICE_NAME=cpu | |
| ;; | |
| esac | |
| lscpu | |
| fi | |
| echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV | |
| - name: Set GPU name and type | |
| working-directory: vllm-benchmarks | |
| shell: bash | |
| run: | | |
| set -eux | |
| if [[ "${DEVICE_NAME}" == "cuda" ]]; then | |
| # Return the same device name as PyTorch | |
| DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader) | |
| elif [[ "${DEVICE_NAME}" == "rocm" ]]; then | |
| DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs) | |
| elif [[ "${DEVICE_NAME}" == "hpu" ]]; then | |
| DEVICE_TYPE="Intel Gaudi3 "$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//') | |
| elif [[ "${DEVICE_NAME}" == "cpu" ]]; then | |
| DEVICE_TYPE="$(lscpu | grep "Model name" | sed -E 's/.*Model name:[[:space:]]*//; s/Intel\(R\)//g; s/\(R\)//g; s/\(TM\)//g; s/CPU//g; s/Processor//g; s/[[:space:]]+/ /g; s/^ //; s/ $//; s/ /_/g')_$(awk -F: '/Core\(s\) per socket/ {c=$2} /Socket\(s\)/ {s=$2} END {gsub(/ /,"",c); gsub(/ /,"",s); printf "%sc", c*s}' < <(lscpu))" | |
| elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then | |
| DEVICE_TYPE=$(lscpu | grep 'Vendor ID' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ") | |
| fi | |
| echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV | |
| - name: Install dependencies | |
| shell: bash | |
| run: | | |
| set -eux | |
| if [[ "${DEVICE_NAME}" == "rocm" ]]; then | |
| pip install -r .github/scripts/requirements.txt \ | |
| --extra-index-url https://download.pytorch.org/whl/rocm6.3 | |
| elif [[ "${DEVICE_NAME}" == "hpu" ]] || [[ "${DEVICE_NAME}" == *cpu* ]]; then | |
| grep -v "^torch==" .github/scripts/requirements.txt > /tmp/requirements_no_torch.txt | |
| pip install -r /tmp/requirements_no_torch.txt | |
| else | |
| pip install -r .github/scripts/requirements.txt \ | |
| --extra-index-url https://download.pytorch.org/whl/cu128 | |
| fi | |
| - name: Set Docker registry | |
| shell: bash | |
| env: | |
| HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }} | |
| run: | | |
| set -eux | |
| # Mimic the logic from vllm ci-infra test template | |
| if [[ "${HEAD_BRANCH}" == "main" ]]; then | |
| DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo | |
| else | |
| DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-test-repo | |
| fi | |
| DOCKER_IMAGE_SUFFIX="" | |
| if [[ "${DEVICE_NAME}" == "rocm" ]]; then | |
| DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci | |
| elif [[ "${DEVICE_NAME}" == "hpu" ]]; then | |
| DOCKER_IMAGE_SUFFIX=-hpu | |
| elif [[ "${DEVICE_NAME}" == "cpu" ]]; then | |
| DOCKER_IMAGE_SUFFIX=-cpu | |
| elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then | |
| DOCKER_IMAGE_SUFFIX=-arm64-cpu | |
| fi | |
| echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV | |
| echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV | |
| - name: Authenticate with AWS | |
| # Only need for DGX hosts | |
| if: contains(env.DEVICE_TYPE, 'B200') | |
| uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 | |
| with: | |
| role-to-assume: arn:aws:iam::308535385114:role/public_ecr_read_only | |
| role-duration-seconds: 18000 | |
| aws-region: us-east-1 | |
| - name: Login to public.ecr.aws | |
| # Only need for DGX hosts | |
| if: contains(env.DEVICE_TYPE, 'B200') | |
| uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 | |
| with: | |
| registry-type: public | |
| - name: Clean up unused Docker images | |
| if: env.DEVICE_NAME == 'hpu' || contains(matrix.runner, 'gnr') | |
| run: | | |
| set -eux | |
| docker image prune -a -f || true | |
| - name: Check for last benchmark commit | |
| working-directory: vllm-benchmarks | |
| env: | |
| HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }} | |
| HEAD_SHA: ${{ inputs.vllm_commit || '' }} | |
| MODELS: ${{ matrix.models }} | |
| run: | | |
| set -eux | |
| if [[ -z "${HEAD_SHA}" ]]; then | |
| pushd vllm | |
| # Looking back the latest 100 commits is enough | |
| for i in {0..99} | |
| do | |
| # Check if the image is there, if it doesn't then check an older one | |
| # because the commit is too recent | |
| HEAD_SHA=$(git rev-parse --verify HEAD~${i}) | |
| DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" | |
| # No Docker image available yet because the commit is too recent | |
| if ! docker manifest inspect "${DOCKER_IMAGE}"; then | |
| continue | |
| fi | |
| NOT_EXIST=0 | |
| S3_PATH="v3/vllm-project/vllm/${HEAD_BRANCH}/${HEAD_SHA}/${DEVICE_TYPE// /_}/benchmark_results_${MODELS//\//_}.json" | |
| aws s3api head-object --bucket ossci-benchmarks --key ${S3_PATH} || NOT_EXIST=1 | |
| if [[ ${NOT_EXIST} == "1" ]]; then | |
| echo "Found a vLLM commit ${HEAD_SHA} that hasn't been benchmarked yet" | |
| break | |
| fi | |
| done | |
| popd | |
| fi | |
| echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_ENV | |
| # Print the benchmark commit for reference | |
| echo "### Run benchmark on [${HEAD_SHA}](https://github.com/vllm-project/vllm/commit/${HEAD_SHA})" >> "${GITHUB_STEP_SUMMARY}" | |
| - name: Setup CUDA GPU_FLAG for docker run | |
| if: env.DEVICE_NAME == 'cuda' | |
| run: | | |
| echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}" | |
| - name: Setup ROCm | |
| if: env.DEVICE_NAME == 'rocm' | |
| uses: pytorch/pytorch/./.github/actions/setup-rocm@main | |
| - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container | |
| run: | | |
| echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}" | |
| - name: Setup benchmark tests | |
| env: | |
| MODELS: ${{ matrix.models }} | |
| run: | | |
| set -eux | |
| pushd vllm-benchmarks/vllm | |
| git checkout "${HEAD_SHA}" | |
| rm .buildkite/performance-benchmarks/tests/*.json || true | |
| popd | |
| # Set the list of benchmarks we want to cover in this runner | |
| python3 .github/scripts/setup_vllm_benchmark.py \ | |
| --from-benchmark-configs-dir vllm-benchmarks/benchmarks \ | |
| --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/performance-benchmarks/tests \ | |
| --models "${MODELS}" \ | |
| --device "${DEVICE_NAME}" \ | |
| --include-eager-mode | |
| pushd vllm-benchmarks/vllm | |
| ls -lah .buildkite/performance-benchmarks/tests | |
| find .buildkite/performance-benchmarks/tests -type f -exec cat {} \; | |
| popd | |
| - name: Run vLLM benchmark | |
| env: | |
| SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 | |
| SCCACHE_REGION: us-east-1 | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| HF_HOME: /mnt/hf_cache | |
| FLASHINFER_WORKSPACE_BASE: /mnt/hf_cache | |
| TRANSFORMERS_OFFLINE: ${{ github.event_name == 'workflow_dispatch' && inputs.hf_offline == false && '0' || '1' }} | |
| DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_SUFFIX }} | |
| # vLLM-related environment variables | |
| ENGINE_VERSION: v1 | |
| SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 | |
| run: | | |
| set -eux | |
| ON_CPU=0 | |
| if [[ "${DEVICE_NAME}" == "cpu" ]] || [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then | |
| ON_CPU=1 | |
| fi | |
| # Just create an empty HF_CACHE dir if it doesn't exist | |
| if [[ ! -d "${HF_HOME}" ]]; then | |
| export HF_HOME="${RUNNER_TEMP}/hf_cache" | |
| export FLASHINFER_WORKSPACE_BASE="${HF_HOME}" | |
| mkdir -p "${HF_HOME}" | |
| # When there is no cache directory, e.g. benchmark, the job has no | |
| # way but to reach out to HF if needed | |
| export TRANSFORMERS_OFFLINE=0 | |
| fi | |
| container_name=$(docker run \ | |
| ${GPU_FLAG:-} \ | |
| ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ | |
| -e SCCACHE_BUCKET \ | |
| -e SCCACHE_REGION \ | |
| -e DEVICE_NAME \ | |
| -e DEVICE_TYPE \ | |
| -e HF_TOKEN \ | |
| -e HF_HOME \ | |
| -e FLASHINFER_WORKSPACE_BASE \ | |
| -e TRANSFORMERS_OFFLINE \ | |
| -e ENGINE_VERSION \ | |
| -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ | |
| -e ON_CPU="${ON_CPU}" \ | |
| --ipc=host \ | |
| --tty \ | |
| --detach \ | |
| --security-opt seccomp=unconfined \ | |
| --shm-size=4g \ | |
| -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ | |
| -v "${HF_HOME}:${HF_HOME}" \ | |
| -w /tmp/workspace \ | |
| "${DOCKER_IMAGE}" | |
| ) | |
| if [[ "${DEVICE_NAME}" == "cuda" ]]; then | |
| docker exec -t "${container_name}" bash -c " | |
| pip install torchao==0.16.0 fbgemm-gpu-genai==1.5.0 | |
| # A quick mitigation for https://github.com/vllm-project/vllm/issues/32373 | |
| rm /etc/ld.so.conf.d/cuda-compat.conf || true | |
| ldconfig | |
| " | |
| fi | |
| docker exec -t "${container_name}" bash -c " | |
| cd vllm-benchmarks/vllm && bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh | |
| " | |
| - name: Authenticate with AWS | |
| # AWS CUDA runners already have access to the bucket via its runner IAM role | |
| if: env.DEVICE_NAME == 'rocm' || env.DEVICE_NAME == 'hpu' || contains(env.DEVICE_TYPE, 'B200') || contains(matrix.runner, 'gnr') | |
| uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0 | |
| with: | |
| role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results | |
| # The max duration enforced by the server side | |
| role-duration-seconds: 18000 | |
| aws-region: us-east-1 | |
| - name: Check the benchmark results | |
| env: | |
| BENCHMARK_RESULTS: vllm-benchmarks/vllm/benchmarks/results | |
| run: | | |
| set -eux | |
| sudo chown -R ${UID} "${BENCHMARK_RESULTS}" | |
| ls -lah "${BENCHMARK_RESULTS}" | |
| # Fail when there is no result or if the metrics are all zero | |
| python3 .github/scripts/check_benchmark_results.py \ | |
| --benchmark-results "${BENCHMARK_RESULTS}" \ | |
| --strict | |
| - name: Upload the benchmark results | |
| env: | |
| BENCHMARK_RESULTS: vllm-benchmarks/vllm/benchmarks/results | |
| MODELS: ${{ matrix.models }} | |
| WORKFLOW_RUN_ID: ${{ github.run_id }} | |
| RUN_ATTEMPT: ${{ github.run_attempt }} | |
| JOB_ID: ${{ steps.get-job-id.outputs.job-id }} | |
| RUN_LOCAL_BRANCH: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.ref || github.ref }} | |
| run: | | |
| set -eux | |
| echo "## workflow info: ${WORKFLOW_RUN_ID} ${RUN_ATTEMPT} ${JOB_ID}" | |
| SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alnum:].-]/_/g") | |
| SANITIZED_MODELS="${MODELS//\//_}" | |
| python3 .github/scripts/upload_benchmark_results.py \ | |
| --repo vllm-benchmarks/vllm \ | |
| --benchmark-name "vLLM benchmark" \ | |
| --benchmark-results "${BENCHMARK_RESULTS}" \ | |
| --device-name "${DEVICE_NAME}" \ | |
| --device-type "${SANITIZED_DEVICE_TYPE}" \ | |
| --model "${SANITIZED_MODELS}" | |
| echo "SANITIZED_DEVICE_TYPE=$SANITIZED_DEVICE_TYPE" >> $GITHUB_ENV | |
| echo "SANITIZED_MODELS=$SANITIZED_MODELS" >> $GITHUB_ENV | |
| # Keep a copy of the benchmark results on GitHub for reference | |
| - uses: actions/upload-artifact@v4 | |
| with: | |
| name: benchmark-results--${{ env.SANITIZED_DEVICE_TYPE }}-${{ env.SANITIZED_MODELS }} | |
| path: vllm-benchmarks/vllm/benchmarks/results |