8 changes: 5 additions & 3 deletions .buildkite/run-cpu-test.sh
@@ -19,13 +19,14 @@ remove_docker_container

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2

function cpu_tests() {
set -e
export NUMA_NODE=$2
+export BUILDKITE_BUILD_NUMBER=$3

# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
@@ -36,6 +37,7 @@ function cpu_tests() {
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pip install -r vllm/requirements/test.txt
+pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@@ -85,4 +87,4 @@

# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
-timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
2 changes: 1 addition & 1 deletion Dockerfile.cpu
@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li

RUN echo 'ulimit -c 0' >> ~/.bashrc

-RUN pip install intel_extension_for_pytorch==2.5.0
+RUN pip install intel_extension_for_pytorch==2.6.0

WORKDIR /workspace

2 changes: 1 addition & 1 deletion cmake/cpu_extension.cmake
@@ -149,7 +149,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-GIT_TAG v3.6
+GIT_TAG v3.7.1
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
2 changes: 1 addition & 1 deletion requirements/cpu.txt
@@ -2,7 +2,7 @@
-r common.txt

# Dependencies for CPUs
-torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" and platform_machine != "s390x"
+torch==2.6.0+cpu; platform_machine == "x86_64"
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
torch==2.7.0.dev20250304; platform_machine == "s390x"

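
The x86_64 pin moves to torch 2.6.0+cpu, and the selector collapses from a chain of negated markers into a single positive platform_machine == "x86_64" check. For anyone unfamiliar with these PEP 508 environment markers, here is an illustrative sketch of how pip evaluates them, using the packaging library (explanatory only, not part of vLLM):

from packaging.markers import Marker

x86_marker = Marker('platform_machine == "x86_64"')

# True on an x86_64 host, False on aarch64/ppc64le/s390x, so the
# torch==2.6.0+cpu pin applies only to x86_64 installs.
print(x86_marker.evaluate())
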
2 changes: 1 addition & 1 deletion tests/lora/test_qwen2vl.py
@@ -12,7 +12,7 @@
from vllm.platforms import current_platform


-@pytest.fixture(autouse=True)
+@pytest.fixture(autouse=not current_platform.is_cpu())
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
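
The fixture's autouse flag is now computed at collection time, so the dual-engine wrapper stays opt-in on CPU platforms instead of running for every test. A minimal sketch of this conditional-autouse pattern, with a hypothetical is_cpu() standing in for vllm.platforms.current_platform.is_cpu():

import pytest

def is_cpu() -> bool:
    # Hypothetical stand-in for current_platform.is_cpu().
    return False

@pytest.fixture(autouse=not is_cpu())
def run_both_engines():
    # With autouse=True (non-CPU), every test in the module gets this
    # fixture automatically; with autouse=False (CPU), only tests that
    # request it by name do.
    yield
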
2 changes: 1 addition & 1 deletion vllm/attention/ops/ipex_attn.py
@@ -17,7 +17,7 @@ class _PagedAttention:

@staticmethod
def get_supported_head_sizes() -> List[int]:
-return [32, 64, 80, 96, 112, 128, 256]
+return [32, 64, 80, 96, 112, 128, 192, 256]

@staticmethod
def get_kv_cache_shape(
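
Head size 192 joins the supported list, presumably so that models with 192-wide attention heads (for example, DeepSeek-style models falling back to standard attention once MLA is disabled in vllm/platforms/cpu.py below) can use the IPEX paged-attention path. A hedged sketch of the kind of guard a caller might build on this list; the helper name is illustrative, not a vLLM API:

from typing import List

def validate_head_size(head_size: int, supported: List[int]) -> None:
    # Raise early if the model's head size has no IPEX paged-attention kernel.
    if head_size not in supported:
        raise ValueError(
            f"head_size={head_size} is not supported; "
            f"expected one of {supported}")

validate_head_size(192, [32, 64, 80, 96, 112, 128, 192, 256])  # now passes
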
9 changes: 5 additions & 4 deletions vllm/executor/multiproc_worker_utils.py
@@ -254,10 +254,11 @@ def _run_worker_process(
# online (in situ) tuning is enabled.
# Offline tuning API (record_untuned_is_enabled()) only
# available in PyTorch 2.6 or later.
-import torch.cuda.tunable as tunable
-if (tunable.is_enabled() and tunable.tuning_is_enabled()
-        and not tunable.record_untuned_is_enabled()):
-    tunable.write_file()
+if torch.cuda.is_available():
+    import torch.cuda.tunable as tunable
+    if (tunable.is_enabled() and tunable.tuning_is_enabled()
+            and not tunable.record_untuned_is_enabled()):
+        tunable.write_file()

logger.info("Worker exiting")

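
The TunableOp flush is now gated on torch.cuda.is_available(), so CPU-only workers never import torch.cuda.tunable during shutdown. Restated as a standalone helper (the tunable calls are real PyTorch 2.6+ APIs; the function name is ours):

import torch

def maybe_flush_tunableop() -> None:
    # On CPU-only builds there is nothing to flush and the CUDA-side
    # queries below are not meaningful, so bail out first.
    if not torch.cuda.is_available():
        return
    import torch.cuda.tunable as tunable
    # Write tuned-GEMM results to disk only when online (in situ) tuning
    # is enabled and offline recording is not handling them instead.
    if (tunable.is_enabled() and tunable.tuning_is_enabled()
            and not tunable.record_untuned_is_enabled()):
        tunable.write_file()
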
6 changes: 5 additions & 1 deletion vllm/model_executor/layers/fused_moe/layer.py
@@ -193,10 +193,11 @@ def forward_cpu(
global_num_experts: int = -1,
expert_map: Optional[torch.Tensor] = None,
custom_routing_function: Optional[Callable] = None,
+scoring_func: str = "softmax",
+e_score_correction_bias: Optional[torch.Tensor] = None,
activation: str = "silu",
**kwargs,
):
-assert custom_routing_function is None
assert activation == "silu", f"{activation} is not supported."
return layer.ipex_fusion(
x,
@@ -206,6 +207,9 @@ def forward_cpu(
renormalize,
topk_group,
num_expert_group,
+custom_routing_function,
+scoring_func,
+e_score_correction_bias,
)

def forward_tpu(
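
forward_cpu now accepts scoring_func and e_score_correction_bias and forwards them, together with custom_routing_function, to layer.ipex_fusion, instead of asserting that no custom routing is used. This lets the IPEX fused-MoE path serve models that score experts with a sigmoid plus a correction bias (DeepSeek-style gating) rather than a plain softmax. A rough, illustrative sketch of what the scoring_func switch means in gating terms; the real routing happens inside the fused IPEX kernel, not in code like this:

from typing import Optional

import torch

def naive_route(gate_logits: torch.Tensor, top_k: int,
                scoring_func: str = "softmax",
                e_score_correction_bias: Optional[torch.Tensor] = None):
    if scoring_func == "softmax":
        scores = gate_logits.softmax(dim=-1)
    elif scoring_func == "sigmoid":
        scores = gate_logits.sigmoid()
    else:
        raise ValueError(f"unsupported scoring_func: {scoring_func}")
    if e_score_correction_bias is not None:
        # Bias the per-expert scores before selecting the top-k experts.
        scores = scores + e_score_correction_bias
    topk_weights, topk_ids = scores.topk(top_k, dim=-1)
    return topk_weights, topk_ids
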
3 changes: 3 additions & 0 deletions vllm/platforms/cpu.py
@@ -121,6 +121,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
# Disable torch async compiling which won't work with daemonic processes
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"

+# MLA attention is not supported
+os.environ["VLLM_MLA_DISABLE"] = "1"

# Intel OpenMP setting
ld_prealod_str = os.getenv("LD_PRELOAD", "")
if "libiomp5.so" in ld_prealod_str:
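
Setting VLLM_MLA_DISABLE=1 here forces multi-head latent attention off for every CPU run, since no MLA kernels exist on this backend; affected models fall back to standard attention, which is likely why head size 192 was added to the IPEX attention list above. A minimal sketch of the read side of such a flag (vLLM centralizes environment flags in its envs module, so the helper below is illustrative only):

import os

def mla_disabled() -> bool:
    # Illustrative lookup; vLLM's actual flag handling lives in its envs module.
    return os.environ.get("VLLM_MLA_DISABLE", "0") == "1"

os.environ["VLLM_MLA_DISABLE"] = "1"
assert mla_disabled()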