[WIP] new scheduler policy based on tkv shift ratio #3768
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
name: Test

on:
  # Don't use `paths` or `paths-ignore` filter since this workflow is required
  # for all pull requests on main irrespective of file type or location
  # Use `changed-src-files` step to determine if source code was changed
  pull_request:
    # add labeled and unlabeled to the default types (runs when label is added)
    types: [opened, synchronize, reopened, labeled, unlabeled, auto_merge_enabled]
    branches: [main]
  push:
    branches: [main]
  workflow_dispatch:

env:
  FORCE_COLOR: "1"
  VLLM_CPU_DISABLE_AVX512: "true"
  VLLM_TARGET_DEVICE: "empty"
  VLLM_PLUGINS: "sendnn_inference"
  HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
  DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
  DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"

# Cancel in-flight runs for the same branch/PR when a new run starts
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  test:
    timeout-minutes: 20
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: ["ubuntu-latest"]
        python_version: ["3.12"]
        vllm_version:
          - name: "default"
            repo: ""
          - name: "vLLM:main"
            repo: "git+https://github.com/vllm-project/vllm --branch main"
        test_suite:
          - name: "chunked prefill"
            markers: "cpu and decoder and not prefix_caching and not quantized and not multimodal"
          - name: "prefix caching"
            markers: "cpu and decoder and prefix_caching and not quantized"
          - name: "fp8"
            markers: "cpu and quantized and multi"
            flags: "--timeout=600 -k 'basic and test_output' --durations=0"
            hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
            hf_model_rev: "0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e"
          - name: "embedding"
            markers: "cpu and embedding and not quantized"
            flags: "--timeout=300"
            hf_model: "sentence-transformers/all-roberta-large-v1"
            hf_model_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
          - name: "scoring"
            markers: "cpu and scoring"
            flags: "--timeout=300"
            hf_model: "cross-encoder/stsb-roberta-large"
            hf_model_rev: "2b12c2c0088918e76151fd5937b7bba986ef1f98"
          - name: "worker and utils"
            markers: "not e2e and not quantized and not spyre and not multimodal"
            flags: "--timeout=300"
          - name: "multimodal"
            markers: "cpu and multimodal"
            flags: "--timeout=300 -sv"
            # hf_model: "ibm-granite/granite-vision-3.2-2b"
            # hf_model_rev: "2818ae5b93cb750b099df1b65f7864e4a0401271"
            env_overrides: "HF_HUB_OFFLINE=0"
        include:
          # Lower bound support
          - vllm_version:
              name: "vLLM:lowest"
              repo: "git+https://github.com/vllm-project/vllm --tag v0.19.0"
            test_suite:
              name: "backward compat"
              markers: "compat or (cpu and basic and not quantized)"
              flags: "--timeout=300"
              hf_model_2: "sentence-transformers/all-roberta-large-v1"
              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
            os: "ubuntu-latest"
            python_version: "3.12"
          # # Intermediate versions of vllm to check basic support for as well
          # - vllm_version:
          #     name: "vLLM:0.14.0"
          #     repo: "git+https://github.com/vllm-project/vllm --tag v0.14.0"
          #   test_suite:
          #     name: "backward compat"
          #     markers: "cpu and basic and not quantized"
          #     flags: "--timeout=300"
          #     hf_model_2: "sentence-transformers/all-roberta-large-v1"
          #     hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
          #   os: "ubuntu-latest"
          #   python_version: "3.12"
        # Only run vllm:main jobs on PRs with `vllm:main` label
        exclude: >-
          ${{
            (
              github.event_name != 'pull_request' ||
              !(contains(toJson(github.event.pull_request.labels), '"vllm:main"'))
            ) && fromJSON('[{"vllm_version":{"name":"vLLM:main"}}]')
            || fromJSON('[]')
          }}
    name: "${{ matrix.test_suite.name }} (${{ matrix.vllm_version.name }})"
    steps:
      - name: "Lightweight disk cleanup"
        # super lightweight cleanup, not nearly as much as actions/free-up-disk-space
        shell: bash
        run: |
          rm -rf /usr/share/swift
          # NOTE: fixed path typo (/user/local -> /usr/local); the old path
          # never existed, so this rm was silently a no-op
          rm -rf /usr/local/share/chromium
          sudo rm -rf /usr/local/share/powershell
      - name: "Checkout"
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
      - name: "Get changed source files"
        id: changed-src-files
        uses: tj-actions/changed-files@v46
        with: # Avoid using single or double quotes for multiline patterns
          files: |
            .github/workflows/test.yml
            pyproject.toml
            uv.lock
            tests/**/*.py
            sendnn_inference/**/*.py
            sendnn_inference/config/model_configs.yaml
      - name: "Install uv"
        if: steps.changed-src-files.outputs.any_changed == 'true'
        uses: astral-sh/setup-uv@v5
        with:
          version: "latest"
          python-version: ${{ matrix.python_version }}
          enable-cache: true
          ignore-nothing-to-cache: true
          cache-dependency-glob: |
            pyproject.toml
      - name: "Set vLLM version"
        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.vllm_version.repo )
        run: |
          uv add ${{ matrix.vllm_version.repo }}
          echo "TEST_VLLM_VERSION=${{ matrix.vllm_version.name }}" >> "$GITHUB_ENV"
      - name: "Install vLLM with Spyre plugin"
        if: steps.changed-src-files.outputs.any_changed == 'true'
        run: |
          uv venv .venv --system-site-packages --clear
          source .venv/bin/activate
          # Syncs both the runtime and dev deps, based on the lockfile contents
          uv sync --frozen
          # Builds and installs the sendnn-inference wheel into .venv
          # This needs to be done after `uv sync`, or the wheel install will be
          # overwritten.
          uv pip install -v .
      - name: "Standardize HF model names for caching"
        id: standardize-names
        if: steps.changed-src-files.outputs.any_changed == 'true'
        run: |
          # replace '/' characters in HF_MODEL with '--' for GHA cache keys and
          # in model file names in local HF hub cache
          # don't use in-line default values for variable expansion here to not
          # use the default model revision with a non-default model like this:
          # model="${{ matrix.test_suite.hf_model || env.DEFAULT_HF_MODEL }}"
          # revision="${{ matrix.test_suite.hf_model_rev || env.DEFAULT_HF_MODEL_REV }}"
          if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
            model="${{ matrix.test_suite.hf_model }}"
            revision="${{ matrix.test_suite.hf_model_rev }}"
          else
            model="${{ env.DEFAULT_HF_MODEL }}"
            revision="${{ env.DEFAULT_HF_MODEL_REV }}"
          fi
          safe_name="${model//\//--}"
          echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
          echo "model_path=${HF_HUB_CACHE}/models--${safe_name}" >> "$GITHUB_ENV"
          if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
            model_2="${{ matrix.test_suite.hf_model_2 }}"
            revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
            safe_name_2="${model_2//\//--}"
            echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
            echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
          fi
      - name: "Restore HF models cache"
        id: cache_restore
        if: steps.changed-src-files.outputs.any_changed == 'true'
        uses: actions/cache/restore@v4
        with:
          path: ${{ env.model_path }}
          key: ${{ runner.os }}-hf-model-${{ env.model_key }}
      - name: "Restore HF models cache for additional model"
        id: cache_restore_2
        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 )
        uses: actions/cache/restore@v4
        with:
          path: ${{ env.model_2_path }}
          key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
      - name: "Download HF models"
        if: ( steps.changed-src-files.outputs.any_changed == 'true' && (steps.cache_restore.outputs.cache-hit != 'true' || steps.cache_restore_2.outputs.cache-hit != 'true'))
        run: |
          # We are caching HF models (HF_HUB_CACHE) for reliability rather than
          # speed, since HF downloads are flaky for concurrent jobs.
          # Be careful when adding models to the cache here, as the GHA cache is
          # limited to 10 GB.
          # If a new model is added here, a new hash key is generated. The
          # previous cache blob can then be removed by an admin or can be left
          # to expire after 7 days.
          if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
            model="${{ matrix.test_suite.hf_model }}"
            revision="${{ matrix.test_suite.hf_model_rev }}"
          else
            model="${{ env.DEFAULT_HF_MODEL }}"
            revision="${{ env.DEFAULT_HF_MODEL_REV }}"
          fi
          model_2="${{ matrix.test_suite.hf_model_2 }}"
          revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
          python3 tools/download_model.py -m "$model" -r "${revision:-main}" &
          if [[ -n "$model_2" ]]; then
            python3 tools/download_model.py -m "$model_2" -r "${revision_2:-main}" &
          fi
          wait
      - name: "Save HF models cache"
        if: ( steps.changed-src-files.outputs.any_changed == 'true' && github.event_name != 'pull_request' && steps.cache_restore.outputs.cache-hit != 'true' )
        uses: actions/cache/save@v4
        with:
          path: ${{ env.model_path }}
          key: ${{ runner.os }}-hf-model-${{ env.model_key }}
      - name: "Save HF models cache for additional model"
        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' )
        uses: actions/cache/save@v4
        with:
          path: ${{ env.model_2_path }}
          key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
      - name: "Run tests"
        if: steps.changed-src-files.outputs.any_changed == 'true'
        env:
          MASTER_PORT: 12355
          MASTER_ADDR: localhost
          DISTRIBUTED_STRATEGY_IGNORE_MODULES: WordEmbedding
          SENDNN_INFERENCE_TEST_MODEL_LIST: ""
          HF_HUB_OFFLINE: 1
        run: |
          # Delete the source code so we can ensure we're testing the installed
          # wheel
          rm -fr sendnn_inference
          # We activate .venv manually and run pytest directly instead of using
          # `uv run`, to avoid having `uv run` re-sync any dependencies or
          # re-install the sendnn-inference package from source
          source .venv/bin/activate
          ${{ matrix.test_suite.env_overrides }} python3 -m pytest ${{ matrix.test_suite.flags }} \
            tests -v -m "${{ matrix.test_suite.markers }}"