
Commit ee83917

Merge remote-tracking branch 'upstream/main' into skip-lm-head
* upstream/main:
  [Build] Temporarily Disable Kernels and LoRA tests (vllm-project#6961)
  [core][misc] improve free_finished_seq_groups (vllm-project#6865)
  [Kernel] Remove scaled_fp8_quant kernel padding footgun (vllm-project#6842)
  [Bugfix] Fix tensorizer memory profiling bug during testing (vllm-project#6881)
  [OpenVINO] Updated OpenVINO requirements and build docs (vllm-project#6948)
  [Kernel] Squash a few more warnings (vllm-project#6914)
  [BugFix] Fix use of per-request seed with pipeline parallel (vllm-project#6698)
  [Doc] Super tiny fix doc typo (vllm-project#6949)

2 parents: fd11ac1 + 40c27a7

36 files changed: +422 / -284 lines

.buildkite/test-pipeline.yaml

Lines changed: 20 additions & 20 deletions
@@ -155,12 +155,12 @@ steps:
   - pytest -v -s test_inputs.py
   - pytest -v -s multimodal

-- label: Kernels Test %N
-  #mirror_hardwares: [amd]
-  commands:
-  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
-  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
+# - label: Kernels Test %N
+#   #mirror_hardwares: [amd]
+#   commands:
+#   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+#   - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+#   parallelism: 4

 - label: Models Test
   #mirror_hardwares: [amd]
@@ -202,20 +202,20 @@ steps:
   - export VLLM_ATTENTION_BACKEND=XFORMERS
   - pytest -v -s spec_decode

-- label: LoRA Test %N
-  #mirror_hardwares: [amd]
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
-  parallelism: 4
-
-- label: LoRA Long Context (Distributed)
-  #mirror_hardwares: [amd]
-  num_gpus: 4
-  # This test runs llama 13B, so it is required to run on 4 GPUs.
-  commands:
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s -x lora/test_long_context.py
+# - label: LoRA Test %N
+#   #mirror_hardwares: [amd]
+#   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+#   parallelism: 4
+
+# - label: LoRA Long Context (Distributed)
+#   #mirror_hardwares: [amd]
+#   num_gpus: 4
+#   # This test runs llama 13B, so it is required to run on 4 GPUs.
+#   commands:
+#   # FIXIT: find out which code initialize cuda before running the test
+#   # before the fix, we need to use spawn to test it
+#   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#   - pytest -v -s -x lora/test_long_context.py

 - label: Tensorizer Test
   #mirror_hardwares: [amd]

Dockerfile.openvino

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 # The vLLM Dockerfile is used to construct vLLM image that can be directly used
 # to run the OpenAI compatible server.

-FROM ubuntu:20.04 AS dev
+FROM ubuntu:22.04 AS dev

 RUN apt-get update -y && \
     apt-get install -y python3-pip git
@@ -18,7 +18,7 @@ COPY setup.py /workspace/vllm/
 # install build requirements
 RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
 # build vLLM with OpenVINO backend
-RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/

 COPY examples/ /workspace/vllm/examples
 COPY benchmarks/ /workspace/vllm/benchmarks

csrc/attention/attention_kernels.cu

Lines changed: 2 additions & 2 deletions
@@ -706,7 +706,7 @@ void paged_attention_v1_launcher(
   int kv_block_stride = key_cache.stride(0);
   int kv_head_stride = key_cache.stride(1);

-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
   assert(head_size % thread_group_size == 0);

   // NOTE: alibi_slopes is optional.
@@ -865,7 +865,7 @@ void paged_attention_v2_launcher(
   int kv_block_stride = key_cache.stride(0);
   int kv_head_stride = key_cache.stride(1);

-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
   assert(head_size % thread_group_size == 0);

   // NOTE: alibi_slopes is optional.
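Context for this change (part of vllm-project#6914): thread_group_size is read only by the assert on the next line, and assert compiles away when NDEBUG is defined, so in release builds the variable becomes unused and -Wunused-variable fires. A minimal sketch of the pattern, with illustrative stand-ins rather than vLLM's actual launcher:

#include <cassert>

// Illustrative stand-ins for vLLM's WARP_SIZE / BLOCK_SIZE constants.
constexpr int kWarpSize = 32;
constexpr int kBlockSize = 16;

void check_head_size(int head_size) {
  // Read only by the assert below. With -DNDEBUG the assert expands to
  // nothing, the variable becomes unused, and -Wunused-variable fires.
  // [[maybe_unused]] (C++17) marks the variable as intentionally
  // unused on such paths without deleting the debug-build check.
  [[maybe_unused]] int thread_group_size =
      kWarpSize / kBlockSize > 1 ? kWarpSize / kBlockSize : 1;
  assert(head_size % thread_group_size == 0);
}

int main() { check_head_size(128); }

The aqlm change below squashes the same warning class the other way: deleting a dead variable outright instead of annotating it.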

csrc/quantization/aqlm/gemm_kernels.cu

Lines changed: 0 additions & 2 deletions
@@ -273,8 +273,6 @@ __global__ void Code2x8Dequant(
   }
   __syncthreads();

-  float res = 0;
-
   int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
   while (iters--) {
     if (pred && a_gl_rd < a_gl_end) {

csrc/quantization/fp8/amd/quant_utils.cuh

Lines changed: 2 additions & 0 deletions
@@ -526,6 +526,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
 }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }

 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@@ -536,6 +537,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
 }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }

 // The following macro is used to dispatch the conversion function based on

csrc/quantization/fp8/nvidia/quant_utils.cuh

Lines changed: 2 additions & 0 deletions
@@ -508,6 +508,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
 }
 #endif
   assert(false);
+  return {}; // Squash missing return statement warning
 }

 template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@@ -520,6 +521,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
 }
 #endif
  assert(false);
+  return {}; // Squash missing return statement warning
 }

 // The following macro is used to dispatch the conversion function based on
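The pattern in both quant_utils headers: the real conversions live inside platform #if blocks, so on an unsupported configuration control can reach the end of a non-void function. assert(false) aborts at runtime in debug builds, but the compiler still sees a return-less path and warns. A value-initialized return {}; closes that path. A simplified host-side sketch (the real functions are __device__ templates dispatched on the KV-cache dtype; the guard macro here is a toy stand-in):

#include <cassert>

// Toy stand-in for the platform #if guards in quant_utils.cuh;
// defined here so the example runs to completion.
#define CONVERSIONS_AVAILABLE

template <typename Tout, typename Tin>
Tout convert(const Tin& x) {
#ifdef CONVERSIONS_AVAILABLE
  return static_cast<Tout>(x);  // the real dtype conversions live here
#endif
  // With the guard undefined, control would fall off the end of a
  // non-void function. assert(false) aborts in debug builds, but the
  // compiler still warns about the missing return; returning a
  // value-initialized Tout squashes the warning.
  assert(false);
  return {};
}

int main() { return convert<int, float>(0.0f); }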

csrc/quantization/squeezellm/quant_cuda_kernel.cu

Lines changed: 2 additions & 1 deletion
@@ -203,7 +203,8 @@ void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
 #endif
       mat.data_ptr<int>(),
 #ifndef USE_ROCM
-      (half2*)mul.data<at::Half>(), (__half*)lookup_table.data_ptr<at::Half>(),
+      (half2*)mul.data_ptr<at::Half>(),
+      (__half*)lookup_table.data_ptr<at::Half>(),
 #else
       (float2*)mul.data_ptr<float>(),
       (__half*)lookup_table.data_ptr<at::Half>(),
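Besides splitting the long line, this swaps the long-deprecated Tensor::data<T>() accessor for Tensor::data_ptr<T>(), which is what triggered the warning. A minimal libtorch sketch of the two spellings (assumes a project linked against libtorch):

#include <torch/torch.h>

int main() {
  torch::Tensor mul = torch::zeros({8}, torch::kHalf);

  // Deprecated accessor; newer PyTorch headers flag it at compile time:
  // at::Half* old_ptr = mul.data<at::Half>();

  // Preferred spelling; returns the same raw pointer to the storage:
  at::Half* ptr = mul.data_ptr<at::Half>();
  (void)ptr;
  return 0;
}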

docs/source/getting_started/openvino-installation.rst

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ Install from source

 .. code-block:: console

-    $ PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
+    $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE=openvino python -m pip install -v .

 .. _openvino_backend_performance_tips:

requirements-openvino.txt

Lines changed: 27 additions & 1 deletion
@@ -1,7 +1,33 @@
 # Common dependencies
--r requirements-common.txt
+# -r requirements-common.txt
+# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2
+cmake >= 3.21
+ninja # For faster builds.
+psutil
+sentencepiece # Required for LLaMA tokenizer.
+numpy < 2.0.0
+requests
+tqdm
+py-cpuinfo
+transformers < 4.43
+tokenizers >= 0.19.1 # Required for Llama 3.
+fastapi
+aiohttp
+openai
+uvicorn[standard]
+pydantic >= 2.0 # Required for OpenAI server.
+pillow # Required for image processing
+prometheus_client >= 0.18.0
+prometheus-fastapi-instrumentator >= 7.0.0
+tiktoken >= 0.6.0 # Required for DBRX tokenizer
+lm-format-enforcer == 0.10.3
+outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
+typing_extensions
+filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
+pyzmq

 # OpenVINO dependencies
 torch >= 2.1.2
 openvino ~= 2024.3.0.dev
+openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
 optimum-intel[openvino] >= 1.18.1

tests/quantization/test_fp8.py

Lines changed: 1 addition & 1 deletion
@@ -123,7 +123,7 @@ def per_tensor_dequantize(tensor, inv_scale, dtype):
     assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))

     # Padding
-    y, _ = ops.scaled_fp8_quant(x, inv_scale, batch_dim_padding=17)
+    y, _ = ops.scaled_fp8_quant(x, inv_scale, num_token_padding=17)
     assert y.shape[0] == 17
     assert torch.allclose(
         ref_y,
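The rename tracks vllm-project#6842: the padding argument pads the first (token) dimension of the quantized output up to at least the requested size, so num_token_padding describes it better than batch_dim_padding. A toy C++ sketch of just the padding semantics (plain int8 rounding stands in for fp8; this is not vLLM's kernel):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Quantize num_tokens x hidden_size values, padding the token dimension
// up to at least num_token_padding rows (extra rows stay zero-filled).
std::vector<int8_t> quant_padded(const std::vector<float>& x, int num_tokens,
                                 int hidden_size, int num_token_padding,
                                 float scale) {
  int padded = std::max(num_tokens, num_token_padding);
  std::vector<int8_t> out(static_cast<size_t>(padded) * hidden_size, 0);
  for (int i = 0; i < num_tokens * hidden_size; ++i)
    out[i] = static_cast<int8_t>(x[i] / scale);  // toy quantization step
  return out;
}

int main() {
  std::vector<float> x(4 * 8, 1.0f);  // 4 tokens, hidden size 8
  auto y = quant_padded(x, 4, 8, /*num_token_padding=*/17, 0.5f);
  std::printf("padded token dim: %zu\n", y.size() / 8);  // prints 17
}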
