Merged · Changes from 5 commits
58 changes: 29 additions & 29 deletions .buildkite/test-pipeline.yaml
@@ -53,13 +53,13 @@ steps:
  - tests/standalone_tests/lazy_imports.py
  commands:
  - python3 standalone_tests/lazy_imports.py
-  - pytest -v -s mq_llm_engine # MQLLMEngine
-  - pytest -v -s async_engine # AsyncLLMEngine
+  - VLLM_CI_USE_S3=1 pytest -v -s mq_llm_engine # MQLLMEngine
> Review comment (Member): please remove the change before merge.
+  - VLLM_CI_USE_S3=1 pytest -v -s async_engine # AsyncLLMEngine
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
-  - pytest -v -s test_inputs.py
-  - pytest -v -s multimodal
-  - pytest -v -s test_utils.py # Utils
-  - pytest -v -s worker # Worker
+  - VLLM_CI_USE_S3=1 pytest -v -s test_inputs.py
+  - VLLM_CI_USE_S3=1 pytest -v -s multimodal
+  - VLLM_CI_USE_S3=1 pytest -v -s test_utils.py # Utils
+  - VLLM_CI_USE_S3=1 pytest -v -s worker # Worker

- label: Python-only Installation Test
  source_file_dependencies:
@@ -78,10 +78,10 @@ steps:
  - tests/basic_correctness/test_preemption
  - tests/basic_correctness/test_cumem.py
  commands:
-  - pytest -v -s basic_correctness/test_cumem.py
-  - pytest -v -s basic_correctness/test_basic_correctness.py
-  - pytest -v -s basic_correctness/test_cpu_offload.py
-  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+  - VLLM_CI_USE_S3=1 pytest -v -s basic_correctness/test_cumem.py
+  - VLLM_CI_USE_S3=1 pytest -v -s basic_correctness/test_basic_correctness.py
+  - VLLM_CI_USE_S3=1 pytest -v -s basic_correctness/test_cpu_offload.py
+  - VLLM_CI_USE_S3=1 VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Chunked Prefill Test
  source_file_dependencies:
@@ -112,14 +112,14 @@ steps:
  - tests/entrypoints/test_chat_utils
  - tests/entrypoints/offline_mode
  commands:
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
-  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
-  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - VLLM_CI_USE_S3=1 pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - VLLM_CI_USE_S3=1 pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
+  - VLLM_CI_USE_S3=1 pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - VLLM_CI_USE_S3=1 pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
+  - VLLM_CI_USE_S3=1 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
-  - pytest -v -s entrypoints/test_chat_utils.py
-  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - VLLM_CI_USE_S3=1 pytest -v -s entrypoints/test_chat_utils.py
+  - VLLM_CI_USE_S3=1 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Distributed Tests (4 GPUs) # 10min
  working_dir: "/vllm-workspace/tests"
@@ -152,13 +152,13 @@ steps:
  - tests/metrics
  - tests/tracing
  commands:
-  - pytest -v -s metrics
+  - VLLM_CI_USE_S3=1 pytest -v -s metrics
  - "pip install \
    'opentelemetry-sdk>=1.26.0,<1.27.0' \
    'opentelemetry-api>=1.26.0,<1.27.0' \
    'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
    'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
-  - pytest -v -s tracing
+  - VLLM_CI_USE_S3=1 pytest -v -s tracing

##### fast check tests #####
##### 1 GPU test #####
@@ -183,9 +183,9 @@ steps:
  - tests/test_config
  - tests/test_logger
  commands:
-  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
+  - VLLM_CI_USE_S3=1 pytest -v -s engine test_sequence.py test_config.py test_logger.py
  # OOM in the CI unless we run this separately
-  - pytest -v -s tokenization
+  - VLLM_CI_USE_S3=1 pytest -v -s tokenization

- label: V1 Test
  #mirror_hardwares: [amd]
@@ -244,8 +244,8 @@ steps:
  - tests/samplers
  - tests/conftest.py
  commands:
-  - pytest -v -s samplers
-  - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
+  - VLLM_CI_USE_S3=1 pytest -v -s samplers
+  - VLLM_CI_USE_S3=1 VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers

- label: LogitsProcessor Test # 5min
  mirror_hardwares: [amd]
@@ -276,7 +276,7 @@ steps:
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
  parallelism: 4

-- label: "PyTorch Fullgraph Smoke Test" # 9min
+- label: PyTorch Fullgraph Smoke Test # 9min
  fast_check: true
  source_file_dependencies:
  - vllm/
@@ -287,7 +287,7 @@
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py

-- label: "PyTorch Fullgraph Test" # 18min
+- label: PyTorch Fullgraph Test # 18min
  source_file_dependencies:
  - vllm/
  - tests/compile
@@ -328,7 +328,7 @@ steps:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
-  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  command: VLLM_CI_USE_S3=1 VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

- label: LM Eval Small Models # 53min
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
@@ -370,9 +370,9 @@ steps:
  - vllm/
  - tests/models
  commands:
-  - pytest -v -s models/test_transformers.py
-  - pytest -v -s models/test_registry.py
-  - pytest -v -s models/test_initialization.py
+  - VLLM_CI_USE_S3=1 pytest -v -s models/test_transformers.py
+  - VLLM_CI_USE_S3=1 pytest -v -s models/test_registry.py
+  - VLLM_CI_USE_S3=1 pytest -v -s models/test_initialization.py

- label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
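The recurring edit above prepends VLLM_CI_USE_S3=1 to the pytest invocations, presumably so the test fixtures pull model weights from the CI S3 mirror (the vllm-ci-model-weights bucket populated by hfs3.py below) instead of downloading them from the Hugging Face Hub on every run. A minimal sketch of the kind of path resolution this flag could gate — the helper name and call site are hypothetical, not code from this PR:

import os

CI_BUCKET = "s3://vllm-ci-model-weights"

def resolve_model_path(model_id: str) -> str:
    # Hypothetical helper: map an HF model ID to the CI mirror when
    # VLLM_CI_USE_S3=1. hfs3.py uploads each repo under its short name
    # (the last path component), so the same layout is assumed here.
    if os.getenv("VLLM_CI_USE_S3") == "1":
        return f"{CI_BUCKET}/{model_id.split('/')[-1]}"
    return model_id

# resolve_model_path("meta-llama/Llama-2-7b-hf")
# -> "s3://vllm-ci-model-weights/Llama-2-7b-hf" when the flag is set

Reading weights from a same-region bucket avoids Hub rate limits and flaky downloads in CI; the tradeoff is that every model a gated test needs must first be mirrored with a script like hfs3.py.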
208 changes: 208 additions & 0 deletions hfs3.py
@@ -0,0 +1,208 @@
# SPDX-License-Identifier: Apache-2.0
import logging
import os
import shutil

import boto3
from huggingface_hub import HfApi, snapshot_download
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ModelTransfer:

    def __init__(self,
                 model_id,
                 s3_bucket,
                 aws_access_key_id=None,
                 aws_secret_access_key=None,
                 aws_region=None):
        """
        Initialize the ModelTransfer class.

        Args:
            model_id (str): HuggingFace model ID
            s3_bucket (str): Name of the S3 bucket
            aws_access_key_id (str, optional): AWS access key ID.
                Defaults to None.
            aws_secret_access_key (str, optional): AWS secret access key.
                Defaults to None.
            aws_region (str, optional): AWS region. Defaults to None.
        """
        self.model_id = model_id
        self.s3_bucket = s3_bucket
        self.model_name = model_id.split('/')[-1]

        # Initialize S3 client
        self.s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=aws_region)

        # Initialize Hugging Face API
        self.hf_api = HfApi()

    def download_model(self, local_dir):
        """
        Download the model from HuggingFace.

        Args:
            local_dir (str): Local directory to save the model

        Returns:
            str: Path to the downloaded model directory
        """
        logger.info("Downloading model %s...", self.model_id)

        try:
            local_dir_with_model = os.path.join(local_dir, self.model_name)
            snapshot_download(repo_id=self.model_id,
                              local_dir=local_dir_with_model,
                              local_dir_use_symlinks=False,
                              token=os.getenv("HF_TOKEN"))
            logger.info("Model downloaded successfully to %s",
                        local_dir_with_model)
            return local_dir_with_model

        except Exception as e:
            logger.error("Error downloading model: %s", str(e))
            raise

    def upload_to_s3(self, local_dir):
        """
        Upload the model directory to S3.

        Args:
            local_dir (str): Local directory containing the model files
        """
        logger.info("Uploading model to S3 bucket %s...", self.s3_bucket)

        try:
            # Walk through all files in the directory
            for root, _, files in os.walk(local_dir):
                for filename in files:
                    # Get the full local path
                    local_path = os.path.join(root, filename)

                    # Calculate S3 path (preserve directory structure)
                    relative_path = os.path.relpath(local_path, local_dir)
                    s3_path = f"{self.model_name}/{relative_path}"

                    # Upload file with progress bar
                    file_size = os.path.getsize(local_path)
                    with tqdm(total=file_size,
                              unit='B',
                              unit_scale=True,
                              desc=f"Uploading {filename}") as pbar:
                        self.s3_client.upload_file(
                            local_path,
                            self.s3_bucket,
                            s3_path,
                            Callback=lambda bytes_transferred: pbar.update(
                                bytes_transferred))

                    logger.info("Uploaded %s to s3://%s/%s", filename,
                                self.s3_bucket, s3_path)

            logger.info("Model upload completed successfully!")

        except Exception as e:
            logger.error("Error uploading to S3: %s", str(e))
            raise


# "ibm/PowerMoE-3b", "internlm/internlm-chat-7b",
# "internlm/internlm2-chat-7b", "OpenGVLab/Mono-InternVL-2B",
# "internlm/internlm3-8b-instruct", "inceptionai/jais-13b-chat",
# "ai21labs/AI21-Jamba-1.5-Mini", "meta-llama/Meta-Llama-3-8B",
# "decapoda-research/llama-7b-hf", "state-spaces/mamba-130m-hf",
# "tiiuae/falcon-mamba-7b-instruct", "openbmb/MiniCPM-2B-sft-bf16",
# "openbmb/MiniCPM3-4B", "mistralai/Mistral-7B-Instruct-v0.1",
# "mistralai/Mixtral-8x7B-Instruct-v0.1",
# "mistral-community/Mixtral-8x22B-v0.1-AWQ", "mpt", "mosaicml/mpt-7b",
# "nvidia/Minitron-8B-Base", "allenai/OLMo-1B-hf",
# "shanearora/OLMo-7B-1124-hf", "allenai/OLMoE-1B-7B-0924-Instruct",
# "facebook/opt-iml-max-1.3b", "OrionStarAI/Orion-14B-Chat",
# "adept/persimmon-8b-chat", "microsoft/phi-2",
# "microsoft/Phi-3-mini-4k-instruct",
# "microsoft/Phi-3-small-8k-instruct", "microsoft/Phi-3.5-MoE-instruct",
# "Qwen/Qwen2-7B-Instruct", "Qwen/Qwen1.5-MoE-A2.7B-Chat",
# "tiiuae/falcon-40b", "stabilityai/stablelm-zephyr-3b",
# "stabilityai/stablelm-3b-4e1t", "bigcode/starcoder2-3b",
# "upstage/solar-pro-preview-instruct", "Tele-AI/TeleChat2-3B",
# "xverse/XVERSE-7B-Chat", "facebook/bart-base",
# "facebook/bart-large-cnn", "microsoft/Florence-2-base",
# "BAAI/bge-base-en-v1.5", "BAAI/bge-multilingual-gemma2",
# "parasail-ai/GritLM-7B-vllm", "internlm/internlm2-1_8b-reward",
# "ai21labs/Jamba-tiny-reward-dev", "llama",
# "intfloat/e5-mistral-7b-instruct",
# "ssmits/Qwen2-7B-Instruct-embed-base", "Qwen/Qwen2.5-Math-RM-72B",
# "Qwen/Qwen2.5-Math-PRM-7B", "jason9693/Qwen2.5-1.5B-apeach",
# "sentence-transformers/stsb-roberta-base-v2",
# "sentence-transformers/all-roberta-large-v1",
# "intfloat/multilingual-e5-large", "royokong/e5-v",
# "TIGER-Lab/VLM2Vec-Full", "MrLight/dse-qwen2-2b-mrl-v1",
# "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
# "cross-encoder/ms-marco-MiniLM-L-6-v2",
# "cross-encoder/quora-roberta-base", "BAAI/bge-reranker-v2-m3",
# "THUDM/glm-4v-9b", "chatglm2-6b", "deepseek-ai/deepseek-vl2-tiny",
# "adept/fuyu-8b", "h2oai/h2ovl-mississippi-800m",
# "OpenGVLab/InternVL2-1B", "HuggingFaceM4/Idefics3-8B-Llama3",
# "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf",
# "llava-hf/LLaVA-NeXT-Video-7B-hf",
# "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
# "TIGER-Lab/Mantis-8B-siglip-llama3", "openbmb/MiniCPM-o-2_6",
# "openbmb/MiniCPM-V-2_6", "allenai/Molmo-7B-D-0924",
# "nvidia/NVLM-D-72B", "google/paligemma-3b-pt-224",
# "microsoft/Phi-3-vision-128k-instruct", "mistralai/Pixtral-12B-2409",
# "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-Audio-7B-Instruct",
# "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct",
# "fixie-ai/ultravox-v0_5-llama-3_2-1b",
# "meta-llama/Llama-3.2-11B-Vision-Instruct", "openai/whisper-large-v3",
# "JackFram/llama-68m", "JackFram/llama-68m", "JackFram/llama-160m",
# "ArthurZ/Ilama-3.2-1B"


def main():
    # Configuration
    MODEL_ID = [
        "HuggingFaceH4/zephyr-7b-beta",
        "llava-hf/llava-1.5-7b-hf",
        "ArthurZ/Ilama-3.2-1B",
        "meta-llama/Llama-2-7b-hf",
    ]
    S3_BUCKET = "vllm-ci-model-weights"
    # Local directory to temporarily store the model
    LOCAL_DIR = "/home/ec2-user/models"

    AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
    AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
    AWS_REGION = "us-west-2"

    # Create transfer object
    for model_id in MODEL_ID:
        transfer = ModelTransfer(model_id=model_id,
                                 s3_bucket=S3_BUCKET,
                                 aws_access_key_id=AWS_ACCESS_KEY_ID,
                                 aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                                 aws_region=AWS_REGION)

        try:
            # Create local directory if it doesn't exist
            os.makedirs(LOCAL_DIR, exist_ok=True)

            # Download model
            model_dir = transfer.download_model(LOCAL_DIR)

            # Upload to S3 and cleanup
            transfer.upload_to_s3(model_dir)
            shutil.rmtree(model_dir)

        except Exception as e:
            logger.error("Error in transfer process: %s", str(e))
            raise


if __name__ == "__main__":
    main()
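To mirror the four models listed in main(), one would run `python hfs3.py` on a host with enough disk for the largest checkpoint and with credentials in the environment: HF_TOKEN for gated repos, plus AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY for the bucket. Each model is downloaded to /home/ec2-user/models/<short-name>, uploaded to s3://vllm-ci-model-weights/<short-name>/, and the local copy is deleted afterwards. For multi-gigabyte weight shards, boto3's TransferConfig is one way to speed uploads up — a possible extension, not part of this PR:

from boto3.s3.transfer import TransferConfig

# Possible extension (not in this PR): parallel multipart uploads for
# large weight shards, passed via upload_file's Config argument.
transfer_config = TransferConfig(
    multipart_threshold=64 * 1024 * 1024,  # switch to multipart above 64 MiB
    max_concurrency=16,                    # upload threads per file
)
# self.s3_client.upload_file(local_path, self.s3_bucket, s3_path,
#                            Config=transfer_config, Callback=...)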
26 changes: 26 additions & 0 deletions setup.sh
@@ -0,0 +1,26 @@
#!/bin/bash
# This script sets up vLLM without compilation.
# It re-installs vLLM at the given commit using the public wheel,
# and links the cloned Python source code to the installed vLLM for development.
set -e

# Check the number of arguments to be 1 or 2.
if [[ $# -ne 1 && $# -ne 2 ]]; then
    echo "Usage: $0 <vLLM_PATH> <COMMIT (optional)>"
    exit 1
fi

VLLM_PATH=$1
if [ -z "$2" ]; then
    # If the commit is not provided, use merge-base to find the
    # common ancestor of the current branch and main. Run git against
    # the target checkout so the script works from any directory.
    COMMIT=$(git -C "$VLLM_PATH" merge-base main "$(git -C "$VLLM_PATH" branch --show-current)")
else
    COMMIT=$2
fi

pushd "$VLLM_PATH"
pip uninstall -y vllm
VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl \
    pip install -e .
popd
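Typical usage: `./setup.sh ~/vllm` to install the wheel built for the merge-base of the current branch and main, or `./setup.sh ~/vllm <commit-sha>` to pin an exact commit. This leans on two things: vLLM publishes per-commit wheels to the vllm-wheels S3 bucket (the URL pattern in the script), and its build honors VLLM_PRECOMPILED_WHEEL_LOCATION, so `pip install -e .` links the local sources against the prebuilt binary instead of compiling.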
4 changes: 2 additions & 2 deletions tests/basic_correctness/test_basic_correctness.py
@@ -43,10 +43,10 @@ def test_vllm_gc_ed():


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True])
@pytest.mark.parametrize("enforce_eager", [False])
def test_models(
hf_runner,
model: str,