
Commit ee83917

Merge remote-tracking branch 'upstream/main' into skip-lm-head
* upstream/main:
  [Build] Temporarily Disable Kernels and LoRA tests (vllm-project#6961)
  [core][misc] improve free_finished_seq_groups (vllm-project#6865)
  [Kernel] Remove scaled_fp8_quant kernel padding footgun (vllm-project#6842)
  [Bugfix] Fix tensorizer memory profiling bug during testing (vllm-project#6881)
  [OpenVINO] Updated OpenVINO requirements and build docs (vllm-project#6948)
  [Kernel] Squash a few more warnings (vllm-project#6914)
  [BugFix] Fix use of per-request seed with pipeline parallel (vllm-project#6698)
  [Doc] Super tiny fix doc typo (vllm-project#6949)

2 parents: fd11ac1 + 40c27a7

36 files changed: +422 / -284 lines

.buildkite/test-pipeline.yaml

Lines changed: 20 additions & 20 deletions
@@ -155,12 +155,12 @@ steps:
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal

-- label: Kernels Test %N
-  #mirror_hardwares: [amd]
-  commands:
-  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
-  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
+# - label: Kernels Test %N
+#   #mirror_hardwares: [amd]
+#   commands:
+#   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+#   - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+#   parallelism: 4

 - label: Models Test
   #mirror_hardwares: [amd]
@@ -202,20 +202,20 @@ steps:
   - export VLLM_ATTENTION_BACKEND=XFORMERS
   - pytest -v -s spec_decode

-- label: LoRA Test %N
-  #mirror_hardwares: [amd]
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
-  parallelism: 4
-
-- label: LoRA Long Context (Distributed)
-  #mirror_hardwares: [amd]
-  num_gpus: 4
-  # This test runs llama 13B, so it is required to run on 4 GPUs.
-  commands:
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s -x lora/test_long_context.py
+# - label: LoRA Test %N
+#   #mirror_hardwares: [amd]
+#   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+#   parallelism: 4
+
+# - label: LoRA Long Context (Distributed)
+#   #mirror_hardwares: [amd]
+#   num_gpus: 4
+#   # This test runs llama 13B, so it is required to run on 4 GPUs.
+#   commands:
+#   # FIXIT: find out which code initialize cuda before running the test
+#   # before the fix, we need to use spawn to test it
+#   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#   - pytest -v -s -x lora/test_long_context.py

 - label: Tensorizer Test
   #mirror_hardwares: [amd]

Dockerfile.openvino

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.

-FROM ubuntu:20.04 AS dev
+FROM ubuntu:22.04 AS dev

 RUN apt-get update -y && \
     apt-get install -y python3-pip git
@@ -18,7 +18,7 @@ COPY setup.py /workspace/vllm/
 # install build requirements
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
 # build vLLM with OpenVINO backend
-RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/

 COPY examples/ /workspace/vllm/examples
 COPY benchmarks/ /workspace/vllm/benchmarks

csrc/attention/attention_kernels.cu

Lines changed: 2 additions & 2 deletions
@@ -706,7 +706,7 @@ void paged_attention_v1_launcher(
   int kv_block_stride = key_cache.stride(0);
   int kv_head_stride = key_cache.stride(1);

-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
   assert(head_size % thread_group_size == 0);

   // NOTE: alibi_slopes is optional.
@@ -865,7 +865,7 @@ void paged_attention_v2_launcher(
   int kv_block_stride = key_cache.stride(0);
   int kv_head_stride = key_cache.stride(1);

-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
   assert(head_size % thread_group_size == 0);

   // NOTE: alibi_slopes is optional.
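Context for this change (part of vllm-project#6914): thread_group_size is read only by the assert on the next line, and assert compiles away when NDEBUG is defined, so in release builds the variable becomes unused and -Wunused-variable fires. A minimal sketch of the pattern, with illustrative stand-ins rather than vLLM's actual launcher:

#include <cassert>

// Illustrative stand-ins for vLLM's WARP_SIZE / BLOCK_SIZE constants.
constexpr int kWarpSize = 32;
constexpr int kBlockSize = 16;

void check_head_size(int head_size) {
  // Read only by the assert below. With -DNDEBUG the assert expands to
  // nothing, the variable becomes unused, and -Wunused-variable fires.
  // [[maybe_unused]] (C++17) marks the variable as intentionally
  // unused on such paths without deleting the debug-build check.
  [[maybe_unused]] int thread_group_size =
      kWarpSize / kBlockSize > 1 ? kWarpSize / kBlockSize : 1;
  assert(head_size % thread_group_size == 0);
}

int main() { check_head_size(128); }

The aqlm change below squashes the same warning class the other way: deleting a dead variable outright instead of annotating it.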

csrc/quantization/aqlm/gemm_kernels.cu

Lines changed: 0 additions & 2 deletions
@@ -273,8 +273,6 @@ __global__ void Code2x8Dequant(
   }
   __syncthreads();

-  float res = 0;
-
   int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
   while (iters--) {
     if (pred && a_gl_rd < a_gl_end) {

csrc/quantization/fp8/amd/quant_utils.cuh

Lines changed: 2 additions & 0 deletions
@@ -526,6 +526,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
 }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }

 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@@ -536,6 +537,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
 }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }

 // The following macro is used to dispatch the conversion function based on

csrc/quantization/fp8/nvidia/quant_utils.cuh

Lines changed: 2 additions & 0 deletions
@@ -508,6 +508,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
 }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }

 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@@ -520,6 +521,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
 }
 #endif
  assert(false);
+  return {}; // Squash missing return statement warning
 }

 // The following macro is used to dispatch the conversion function based on
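The pattern in both quant_utils headers: the real conversions live inside platform #if blocks, so on an unsupported configuration control can reach the end of a non-void function. assert(false) aborts at runtime in debug builds, but the compiler still sees a return-less path and warns. A value-initialized return {}; closes that path. A simplified host-side sketch (the real functions are __device__ templates dispatched on the KV-cache dtype; the guard macro here is a toy stand-in):

#include <cassert>

// Toy stand-in for the platform #if guards in quant_utils.cuh;
// defined here so the example runs to completion.
#define CONVERSIONS_AVAILABLE

template <typename Tout, typename Tin>
Tout convert(const Tin& x) {
#ifdef CONVERSIONS_AVAILABLE
  return static_cast<Tout>(x);  // the real dtype conversions live here
#endif
  // With the guard undefined, control would fall off the end of a
  // non-void function. assert(false) aborts in debug builds, but the
  // compiler still warns about the missing return; returning a
  // value-initialized Tout squashes the warning.
  assert(false);
  return {};
}

int main() { return convert<int, float>(0.0f); }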

csrc/quantization/squeezellm/quant_cuda_kernel.cu

Lines changed: 2 additions & 1 deletion
@@ -203,7 +203,8 @@ void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
 #endif
       mat.data_ptr<int>(),
 #ifndef USE_ROCM
-      (half2*)mul.data<at::Half>(), (__half*)lookup_table.data_ptr<at::Half>(),
+      (half2*)mul.data_ptr<at::Half>(),
+      (__half*)lookup_table.data_ptr<at::Half>(),
 #else
       (float2*)mul.data_ptr<float>(),
       (__half*)lookup_table.data_ptr<at::Half>(),
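Besides splitting the long line, this swaps the long-deprecated Tensor::data<T>() accessor for Tensor::data_ptr<T>(), which is what triggered the warning. A minimal libtorch sketch of the two spellings (assumes a project linked against libtorch):

#include <torch/torch.h>

int main() {
  torch::Tensor mul = torch::zeros({8}, torch::kHalf);

  // Deprecated accessor; newer PyTorch headers flag it at compile time:
  // at::Half* old_ptr = mul.data<at::Half>();

  // Preferred spelling; returns the same raw pointer to the storage:
  at::Half* ptr = mul.data_ptr<at::Half>();
  (void)ptr;
  return 0;
}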

docs/source/getting_started/openvino-installation.rst

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ Install from source

 .. code-block:: console

-    $ PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
+    $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE=openvino python -m pip install -v .

 .. _openvino_backend_performance_tips:

requirements-openvino.txt

Lines changed: 27 additions & 1 deletion
@@ -1,7 +1,33 @@
 # Common dependencies
--r requirements-common.txt
+# -r requirements-common.txt
+# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2
+cmake >= 3.21
+ninja # For faster builds.
+psutil
+sentencepiece # Required for LLaMA tokenizer.
+numpy < 2.0.0
+requests
+tqdm
+py-cpuinfo
+transformers < 4.43
+tokenizers >= 0.19.1 # Required for Llama 3.
+fastapi
+aiohttp
+openai
+uvicorn[standard]
+pydantic >= 2.0 # Required for OpenAI server.
+pillow # Required for image processing
+prometheus_client >= 0.18.0
+prometheus-fastapi-instrumentator >= 7.0.0
+tiktoken >= 0.6.0 # Required for DBRX tokenizer
+lm-format-enforcer == 0.10.3
+outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
+typing_extensions
+filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
+pyzmq

 # OpenVINO dependencies
 torch >= 2.1.2
 openvino ~= 2024.3.0.dev
+openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
 optimum-intel[openvino] >= 1.18.1

tests/quantization/test_fp8.py

Lines changed: 1 addition & 1 deletion
@@ -123,7 +123,7 @@ def per_tensor_dequantize(tensor, inv_scale, dtype):
     assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))

     # Padding
-    y, _ = ops.scaled_fp8_quant(x, inv_scale, batch_dim_padding=17)
+    y, _ = ops.scaled_fp8_quant(x, inv_scale, num_token_padding=17)
     assert y.shape[0] == 17
     assert torch.allclose(
         ref_y,
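The rename tracks vllm-project#6842: the padding argument pads the first (token) dimension of the quantized output up to at least the requested size, so num_token_padding describes it better than batch_dim_padding. A toy C++ sketch of just the padding semantics (plain int8 rounding stands in for fp8; this is not vLLM's kernel):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Quantize num_tokens x hidden_size values, padding the token dimension
// up to at least num_token_padding rows (extra rows stay zero-filled).
std::vector<int8_t> quant_padded(const std::vector<float>& x, int num_tokens,
                                 int hidden_size, int num_token_padding,
                                 float scale) {
  int padded = std::max(num_tokens, num_token_padding);
  std::vector<int8_t> out(static_cast<size_t>(padded) * hidden_size, 0);
  for (int i = 0; i < num_tokens * hidden_size; ++i)
    out[i] = static_cast<int8_t>(x[i] / scale);  // toy quantization step
  return out;
}

int main() {
  std::vector<float> x(4 * 8, 1.0f);  // 4 tokens, hidden size 8
  auto y = quant_padded(x, 4, 8, /*num_token_padding=*/17, 0.5f);
  std::printf("padded token dim: %zu\n", y.size() / 8);  // prints 17
}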
