[WIP] new scheduler policy based on tkv shift ratio #3768
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
name: Test

on:
  # Don't use `paths` or `paths-ignore` filter since this workflow is required
  # for all pull requests on main irrespective of file type or location
  # Use `changed-src-files` step to determine if source code was changed
  pull_request:
    # add labeled and unlabeled to the default types (runs when label is added)
    types: [opened, synchronize, reopened, labeled, unlabeled, auto_merge_enabled]
    branches: [main]
  push:
    branches: [main]
  workflow_dispatch:

env:
  FORCE_COLOR: "1"
  VLLM_CPU_DISABLE_AVX512: "true"
  VLLM_TARGET_DEVICE: "empty"
  VLLM_PLUGINS: "sendnn_inference"
  HF_HUB_CACHE: "${{ github.workspace }}/.cache/huggingface/hub"
  DEFAULT_HF_MODEL: "ibm-ai-platform/micro-g3.3-8b-instruct-1b"
  DEFAULT_HF_MODEL_REV: "6e9c6465a9d7e5e9fa35004a29f0c90befa7d23f"

# Cancel in-flight runs for the same branch/PR when a new run starts
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  test:
    timeout-minutes: 20
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: ["ubuntu-latest"]
        python_version: ["3.12"]
        vllm_version:
          - name: "default"
            repo: ""
          - name: "vLLM:main"
            repo: "git+https://github.com/vllm-project/vllm --branch main"
        test_suite:
          - name: "chunked prefill"
            markers: "cpu and decoder and not prefix_caching and not quantized and not multimodal"
          - name: "prefix caching"
            markers: "cpu and decoder and prefix_caching and not quantized"
          - name: "fp8"
            markers: "cpu and quantized and multi"
            flags: "--timeout=600 -k 'basic and test_output' --durations=0"
            hf_model: "ibm-ai-platform/micro-g3.3-8b-instruct-1b-FP8"
            hf_model_rev: "0dff8bacb968836dbbc7c2895c6d9ead0a05dc9e"
          - name: "embedding"
            markers: "cpu and embedding and not quantized"
            flags: "--timeout=300"
            hf_model: "sentence-transformers/all-roberta-large-v1"
            hf_model_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
          - name: "scoring"
            markers: "cpu and scoring"
            flags: "--timeout=300"
            hf_model: "cross-encoder/stsb-roberta-large"
            hf_model_rev: "2b12c2c0088918e76151fd5937b7bba986ef1f98"
          - name: "worker and utils"
            markers: "not e2e and not quantized and not spyre and not multimodal"
            flags: "--timeout=300"
          - name: "multimodal"
            markers: "cpu and multimodal"
            flags: "--timeout=300 -sv"
            # hf_model: "ibm-granite/granite-vision-3.2-2b"
            # hf_model_rev: "2818ae5b93cb750b099df1b65f7864e4a0401271"
            env_overrides: "HF_HUB_OFFLINE=0"
        include:
          # Lower bound support
          - vllm_version:
              name: "vLLM:lowest"
              repo: "git+https://github.com/vllm-project/vllm --tag v0.19.0"
            test_suite:
              name: "backward compat"
              markers: "compat or (cpu and basic and not quantized)"
              flags: "--timeout=300"
              hf_model_2: "sentence-transformers/all-roberta-large-v1"
              hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
            os: "ubuntu-latest"
            python_version: "3.12"
          # # Intermediate versions of vllm to check basic support for as well
          # - vllm_version:
          #     name: "vLLM:0.14.0"
          #     repo: "git+https://github.com/vllm-project/vllm --tag v0.14.0"
          #   test_suite:
          #     name: "backward compat"
          #     markers: "cpu and basic and not quantized"
          #     flags: "--timeout=300"
          #     hf_model_2: "sentence-transformers/all-roberta-large-v1"
          #     hf_model_2_rev: "cf74d8acd4f198de950bf004b262e6accfed5d2c"
          #   os: "ubuntu-latest"
          #   python_version: "3.12"
        # Only run vllm:main jobs on PRs with `vllm:main` label
        exclude: >-
          ${{
            (
              github.event_name != 'pull_request' ||
              !(contains(toJson(github.event.pull_request.labels), '"vllm:main"'))
            ) && fromJSON('[{"vllm_version":{"name":"vLLM:main"}}]')
            || fromJSON('[]')
          }}
    name: "${{ matrix.test_suite.name }} (${{ matrix.vllm_version.name }})"
    steps:
      - name: "Lightweight disk cleanup"
        # super lightweight cleanup, not nearly as much as actions/free-up-disk-space
        shell: bash
        run: |
          rm -rf /usr/share/swift
          # NOTE: fixed path typo (/user/local -> /usr/local); the old path
          # never existed, so this rm was silently a no-op
          rm -rf /usr/local/share/chromium
          sudo rm -rf /usr/local/share/powershell
      - name: "Checkout"
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
      - name: "Get changed source files"
        id: changed-src-files
        uses: tj-actions/changed-files@v46
        with: # Avoid using single or double quotes for multiline patterns
          files: |
            .github/workflows/test.yml
            pyproject.toml
            uv.lock
            tests/**/*.py
            sendnn_inference/**/*.py
            sendnn_inference/config/model_configs.yaml
      - name: "Install uv"
        if: steps.changed-src-files.outputs.any_changed == 'true'
        uses: astral-sh/setup-uv@v5
        with:
          version: "latest"
          python-version: ${{ matrix.python_version }}
          enable-cache: true
          ignore-nothing-to-cache: true
          cache-dependency-glob: |
            pyproject.toml
      - name: "Set vLLM version"
        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.vllm_version.repo )
        run: |
          uv add ${{ matrix.vllm_version.repo }}
          echo "TEST_VLLM_VERSION=${{ matrix.vllm_version.name }}" >> "$GITHUB_ENV"
      - name: "Install vLLM with Spyre plugin"
        if: steps.changed-src-files.outputs.any_changed == 'true'
        run: |
          uv venv .venv --system-site-packages --clear
          source .venv/bin/activate
          # Syncs both the runtime and dev deps, based on the lockfile contents
          uv sync --frozen
          # Builds and installs the sendnn-inference wheel into .venv
          # This needs to be done after `uv sync`, or the wheel install will be
          # overwritten.
          uv pip install -v .
      - name: "Standardize HF model names for caching"
        id: standardize-names
        if: steps.changed-src-files.outputs.any_changed == 'true'
        run: |
          # replace '/' characters in HF_MODEL with '--' for GHA cache keys and
          # in model file names in local HF hub cache
          # don't use in-line default values for variable expansion here to not
          # use the default model revision with a non-default model like this:
          # model="${{ matrix.test_suite.hf_model || env.DEFAULT_HF_MODEL }}"
          # revision="${{ matrix.test_suite.hf_model_rev || env.DEFAULT_HF_MODEL_REV }}"
          if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
            model="${{ matrix.test_suite.hf_model }}"
            revision="${{ matrix.test_suite.hf_model_rev }}"
          else
            model="${{ env.DEFAULT_HF_MODEL }}"
            revision="${{ env.DEFAULT_HF_MODEL_REV }}"
          fi
          safe_name="${model//\//--}"
          echo "model_key=${safe_name}_${revision}" >> "$GITHUB_ENV"
          echo "model_path=${HF_HUB_CACHE}/models--${safe_name}" >> "$GITHUB_ENV"
          if [[ -n "${{ matrix.test_suite.hf_model_2 }}" ]]; then
            model_2="${{ matrix.test_suite.hf_model_2 }}"
            revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
            safe_name_2="${model_2//\//--}"
            echo "model_2_key=${safe_name_2}_${revision_2}" >> "$GITHUB_ENV"
            echo "model_2_path=${HF_HUB_CACHE}/models--${safe_name_2}" >> "$GITHUB_ENV"
          fi
      - name: "Restore HF models cache"
        id: cache_restore
        if: steps.changed-src-files.outputs.any_changed == 'true'
        uses: actions/cache/restore@v4
        with:
          path: ${{ env.model_path }}
          key: ${{ runner.os }}-hf-model-${{ env.model_key }}
      - name: "Restore HF models cache for additional model"
        id: cache_restore_2
        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 )
        uses: actions/cache/restore@v4
        with:
          path: ${{ env.model_2_path }}
          key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
      - name: "Download HF models"
        if: ( steps.changed-src-files.outputs.any_changed == 'true' && (steps.cache_restore.outputs.cache-hit != 'true' || steps.cache_restore_2.outputs.cache-hit != 'true'))
        run: |
          # We are caching HF models (HF_HUB_CACHE) for reliability rather than
          # speed, since HF downloads are flaky for concurrent jobs.
          # Be careful when adding models to the cache here, as the GHA cache is
          # limited to 10 GB.
          # If a new model is added here, a new hash key is generated. The
          # previous cache blob can then be removed by an admin or can be left
          # to expire after 7 days.
          if [[ -n "${{ matrix.test_suite.hf_model }}" ]]; then
            model="${{ matrix.test_suite.hf_model }}"
            revision="${{ matrix.test_suite.hf_model_rev }}"
          else
            model="${{ env.DEFAULT_HF_MODEL }}"
            revision="${{ env.DEFAULT_HF_MODEL_REV }}"
          fi
          model_2="${{ matrix.test_suite.hf_model_2 }}"
          revision_2="${{ matrix.test_suite.hf_model_2_rev }}"
          python3 tools/download_model.py -m "$model" -r "${revision:-main}" &
          if [[ -n "$model_2" ]]; then
            python3 tools/download_model.py -m "$model_2" -r "${revision_2:-main}" &
          fi
          wait
      - name: "Save HF models cache"
        if: ( steps.changed-src-files.outputs.any_changed == 'true' && github.event_name != 'pull_request' && steps.cache_restore.outputs.cache-hit != 'true' )
        uses: actions/cache/save@v4
        with:
          path: ${{ env.model_path }}
          key: ${{ runner.os }}-hf-model-${{ env.model_key }}
      - name: "Save HF models cache for additional model"
        if: ( steps.changed-src-files.outputs.any_changed == 'true' && matrix.test_suite.hf_model_2 && github.event_name != 'pull_request' && steps.cache_restore_2.outputs.cache-hit != 'true' )
        uses: actions/cache/save@v4
        with:
          path: ${{ env.model_2_path }}
          key: ${{ runner.os }}-hf-model-${{ env.model_2_key }}
      - name: "Run tests"
        if: steps.changed-src-files.outputs.any_changed == 'true'
        env:
          MASTER_PORT: 12355
          MASTER_ADDR: localhost
          DISTRIBUTED_STRATEGY_IGNORE_MODULES: WordEmbedding
          SENDNN_INFERENCE_TEST_MODEL_LIST: ""
          HF_HUB_OFFLINE: 1
        run: |
          # Delete the source code so we can ensure we're testing the installed
          # wheel
          rm -fr sendnn_inference
          # We activate .venv manually and run pytest directly instead of using
          # `uv run`, to avoid having `uv run` re-sync any dependencies or
          # re-install the sendnn-inference package from source
          source .venv/bin/activate
          ${{ matrix.test_suite.env_overrides }} python3 -m pytest ${{ matrix.test_suite.flags }} \
            tests -v -m "${{ matrix.test_suite.markers }}"