Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion helm-charts/common/lvm-uservice/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

apiVersion: v2
Expand All @@ -13,6 +13,10 @@ dependencies:
version: 0-latest
repository: file://../tgi
condition: tgi.enabled
- name: vllm
version: 0-latest
repository: file://../vllm
condition: vllm.enabled
- name: lvm-serve
version: 0-latest
repository: file://../lvm-serve
Expand Down
30 changes: 19 additions & 11 deletions helm-charts/common/lvm-uservice/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ lvm-uservice depends on one of the following backend services:

- TGI: please refer to [tgi](../tgi) chart for more information

- vLLM: please refer to [vllm](../vllm) chart for more information

- one of the large vision model inference engines: please refer to [lvm-serve](../lvm-serve) chart for more information

First, you need to get the dependent service deployed, i.e. deploy the tgi helm chart, or lvm helm chart.
First, you need to get the dependent service deployed, e.g. deploy the tgi helm chart, the vllm helm chart, or the lvm-serve helm chart.

After you've deployed the dependent service successfully, please run `kubectl get svc` to get the backend service URL, e.g. `http://tgi`, `http://lvm-serve`.
After you've deployed the dependent service successfully, please run `kubectl get svc` to get the backend service URL, e.g. `http://tgi`, `http://myvllm` or `http://lvm-serve`.

To install the `lvm-uservice` chart, run the following:

Expand All @@ -21,10 +23,16 @@ cd GenAIInfra/helm-charts/common/lvm-uservice
helm dependency update
export HFTOKEN="insert-your-huggingface-token-here"

# Use vLLM as the backend
export LLM_MODEL_ID="model-id-used-for-vllm"
export LVM_BACKEND="vLLM"
export LVM_ENDPOINT="http://myvllm"
helm install lvm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set LLM_MODEL_ID=${LLM_MODEL_ID} --set LVM_BACKEND=${LVM_BACKEND} --set LVM_ENDPOINT=${LVM_ENDPOINT} --wait

# Use TGI as the backend
export LVM_BACKEND="TGI"
export LVM_ENDPOINT="http://tgi"
helm install lvm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set LVM_BACKEND=${LVM_BACKEND} --set LVM_ENDPOINT=${LVM_ENDPOINT} --wait
# export LVM_BACKEND="TGI"
# export LVM_ENDPOINT="http://tgi"
# helm install lvm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set LVM_BACKEND=${LVM_BACKEND} --set LVM_ENDPOINT=${LVM_ENDPOINT} --wait

# Use other lvm-serve engine variant as the backend, see file `values.yaml` for more details
# export LVM_ENDPOINT="http://lvm-serve"
Expand All @@ -49,9 +57,9 @@ curl http://localhost:9000/v1/lvm \

## Values

| Key | Type | Default | Description |
| ------------------------------- | ------ | ------- | ------------------------------------------------------------------------------------------------- |
| global.HUGGINGFACEHUB_API_TOKEN | string | `""` | Your own Hugging Face API token |
| LVM_BACKEND | string | `"TGI"` | lvm backend engine, possible value "TGI", "LLaVA", "VideoLlama", "LlamaVision", "PredictionGuard" |
| LVM_ENDPOINT | string | `""` | LVM endpoint |
| global.monitoring | bool | `false` | Service usage metrics |
| Key | Type | Default | Description |
| ------------------------------- | ------ | -------- | --------------------------------------------------------------------------------------------------------- |
| global.HUGGINGFACEHUB_API_TOKEN | string | `""` | Your own Hugging Face API token |
| LVM_BACKEND | string | `"vLLM"` | lvm backend engine, possible value "vLLM", "TGI", "LLaVA", "VideoLlama", "LlamaVision", "PredictionGuard" |
| LVM_ENDPOINT | string | `""` | LVM endpoint |
| global.monitoring | bool | `false` | Service usage metrics |
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
# SPDX-License-Identifier: Apache-2.0

LVM_BACKEND: "LLaVA"
vllm:
enabled: false
tgi:
enabled: false
lvm-serve:
Expand Down
10 changes: 10 additions & 0 deletions helm-charts/common/lvm-uservice/cpu-tgi-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

LVM_BACKEND: "TGI"
vllm:
enabled: false
tgi:
enabled: true
llava:
enabled: false
9 changes: 7 additions & 2 deletions helm-charts/common/lvm-uservice/cpu-values.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

tgi:
LVM_BACKEND: "vLLM"
vllm:
enabled: true
tgi:
enabled: false
lvm-serve:
enabled: false
25 changes: 25 additions & 0 deletions helm-charts/common/lvm-uservice/gaudi-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

LVM_BACKEND: "vLLM"
# The default model is not stable on Gaudi, use the older model.
# https://github.com/HabanaAI/vllm-fork/issues/841
LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
vllm:
enabled: true
accelDevice: "gaudi"
image:
repository: opea/vllm-gaudi
tag: "latest"
OMPI_MCA_btl_vader_single_copy_mechanism: none
PT_HPUGRAPH_DISABLE_TENSOR_CACHE: "false"
VLLM_SKIP_WARMUP: true
LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
extraCmdArgs: ["--tensor-parallel-size", "1", "--chat-template", "examples/template_llava.jinja"]
resources:
limits:
habana.ai/gaudi: 1
tgi:
enabled: false
lvm-serve:
enabled: false
13 changes: 11 additions & 2 deletions helm-charts/common/lvm-uservice/templates/configmap.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

apiVersion: v1
Expand All @@ -14,6 +14,12 @@ data:
{{- if not .Values.LVM_ENDPOINT }}
LVM_ENDPOINT: "http://{{ .Release.Name }}-tgi"
{{- end }}
{{- else if eq "vLLM" .Values.LVM_BACKEND }}
LVM_COMPONENT_NAME: "OPEA_VLLM_LVM"
MAX_IMAGES: {{ .Values.MAX_IMAGES | default 1 | quote }}
{{- if not .Values.LVM_ENDPOINT }}
LVM_ENDPOINT: "http://{{ .Release.Name }}-vllm"
{{- end }}
{{- else }}
{{- if not .Values.LVM_ENDPOINT }}
LVM_ENDPOINT: "http://{{ .Release.Name }}-lvm-serve"
Expand All @@ -34,6 +40,9 @@ data:
{{- if .Values.LVM_ENDPOINT }}
LVM_ENDPOINT: {{ tpl .Values.LVM_ENDPOINT . | quote}}
{{- end }}
{{- if .Values.LLM_MODEL_ID }}
LLM_MODEL_ID: {{ .Values.LLM_MODEL_ID | quote }}
{{- end }}
HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}}
HF_HOME: "/tmp/.cache/huggingface"
{{- if .Values.global.HF_ENDPOINT }}
Expand All @@ -42,7 +51,7 @@ data:
http_proxy: {{ .Values.global.http_proxy | quote }}
https_proxy: {{ .Values.global.https_proxy | quote }}
{{- if and (not .Values.LVM_ENDPOINT) (or .Values.global.http_proxy .Values.global.https_proxy) }}
no_proxy: "{{ .Release.Name }}-tgi,{{ .Release.Name }}-lvm-serve,{{ .Values.global.no_proxy }}"
no_proxy: "{{ .Release.Name }}-tgi,{{ .Release.Name }}-vllm,{{ .Release.Name }}-lvm-serve,{{ .Values.global.no_proxy }}"
{{- else }}
no_proxy: {{ .Values.global.no_proxy | quote }}
{{- end }}
Expand Down
11 changes: 9 additions & 2 deletions helm-charts/common/lvm-uservice/values.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Default values for lvm-uservice.
Expand All @@ -8,12 +8,15 @@
# Set it as a non-null string, such as true, if you want to enable.
LOGFLAG: ""
# backend inference engine to use, e.g. vLLM, TGI, LLaVA, VideoLlama, LlamaVision, PredictionGuard
LVM_BACKEND: "TGI"
LVM_BACKEND: "vLLM"
# maximum image number sent to backend, only valid for TGI, LLaVa backend
MAX_IMAGES: 1
# inference engine service URL, e.g. http://tgi:80
LVM_ENDPOINT: ""

# The vLLM backend requires LLM_MODEL_ID to be set and consistent with the vLLM setting.
LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf

replicaCount: 1

image:
Expand Down Expand Up @@ -105,6 +108,10 @@ global:
prometheusRelease: prometheus-stack

# For CI tests only
vllm:
enabled: false
LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf
extraCmdArgs: ["--chat-template", "examples/template_llava.jinja"]
tgi:
enabled: false
LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf
Expand Down
3 changes: 3 additions & 0 deletions helm-charts/common/vllm/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,6 @@ data:
{{- if .Values.VLLM_TORCH_PROFILER_DIR }}
VLLM_TORCH_PROFILER_DIR: {{ .Values.VLLM_TORCH_PROFILER_DIR | quote }}
{{- end }}
{{- if .Values.PT_HPUGRAPH_DISABLE_TENSOR_CACHE }}
PT_HPUGRAPH_DISABLE_TENSOR_CACHE: {{ .Values.PT_HPUGRAPH_DISABLE_TENSOR_CACHE | quote }}
{{- end }}
2 changes: 2 additions & 0 deletions helm-charts/valuefiles.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ visualqna:
dest_dir: VisualQnA/kubernetes/helm
values:
- cpu-values.yaml
- cpu-tgi-values.yaml
- gaudi-values.yaml
- gaudi-tgi-values.yaml

# Components
agent:
Expand Down
5 changes: 5 additions & 0 deletions helm-charts/visualqna/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,14 @@ name: visualqna
description: The Helm chart to deploy VisualQnA
type: application
dependencies:
- name: vllm
version: 0-latest
repository: "file://../common/vllm"
condition: vllm.enabled
- name: tgi
version: 0-latest
repository: "file://../common/tgi"
condition: tgi.enabled
- name: lvm-uservice
version: 0-latest
repository: "file://../common/lvm-uservice"
Expand Down
25 changes: 23 additions & 2 deletions helm-charts/visualqna/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,28 @@
Helm chart for deploying VisualQnA service. VisualQnA depends on the following services:

- [lvm-uservice](../common/lvm-uservice/README.md)
- [tgi](../common/tgi/README.md)
- [vllm](../common/vllm/README.md)

## Installing the Chart

To install the chart, run the following:

```console
cd GenAIInfra/helm-charts/
./update_dependency.sh
helm dependency update visualqna
export HFTOKEN="insert-your-huggingface-token-here"
export MODELDIR="/mnt/opea-models"
# To use CPU with vLLM
helm install visualqna visualqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR}
# To use Gaudi with vLLM
# helm install visualqna visualqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f visualqna/gaudi-values.yaml

```

### IMPORTANT NOTE

1. Make sure your `MODELDIR` exists on the node where your workload is scheduled so you can cache the downloaded model for next time use. Otherwise, set `global.modelUseHostPath` to 'null' if you don't want to cache the model.

## Verify

Expand Down Expand Up @@ -41,5 +62,5 @@ Open a browser to access `http://<k8s-node-ip-address>:${port}` to play with the
| ----------------- | ------ | ------------------------------------- | -------------------------------------------------------------------------------------- |
| image.repository | string | `"opea/visualqna"` | |
| service.port | string | `"8888"` | |
| tgi.LLM_MODEL_ID | string | `"llava-hf/llava-v1.6-mistral-7b-hf"` | Models id from https://huggingface.co/, or predownloaded model directory |
| vllm.LLM_MODEL_ID | string | `"llava-hf/llava-v1.6-mistral-7b-hf"` | Model id from https://huggingface.co/, or predownloaded model directory |
| global.monitoring | bool | `false` | Enable usage metrics for the service components. See ../monitoring.md before enabling! |
9 changes: 9 additions & 0 deletions helm-charts/visualqna/cpu-tgi-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

tgi:
enabled: true
vllm:
enabled: false
lvm-uservice:
LVM_BACKEND: "TGI"
10 changes: 6 additions & 4 deletions helm-charts/visualqna/cpu-values.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

vllm:
enabled: true
tgi:
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf
enabled: false
lvm-uservice:
LVM_BACKEND: "vLLM"
41 changes: 41 additions & 0 deletions helm-charts/visualqna/gaudi-tgi-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

# TGI: largest bottleneck for VisualQnA
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
vllm:
enabled: false
lvm-uservice:
LVM_BACKEND: "TGI"
46 changes: 17 additions & 29 deletions helm-charts/visualqna/gaudi-values.yaml
Original file line number Diff line number Diff line change
@@ -1,36 +1,24 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

# TGI: largest bottleneck for VisualQnA
tgi:
vllm:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
repository: opea/vllm-gaudi
tag: "latest"
LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
VLLM_SKIP_WARMUP: true
OMPI_MCA_btl_vader_single_copy_mechanism: none
PT_HPUGRAPH_DISABLE_TENSOR_CACHE: "false"
extraCmdArgs: ["--tensor-parallel-size","1","--chat-template","examples/template_llava.jinja"]
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
tgi:
enabled: false
lvm-uservice:
LVM_BACKEND: "vLLM"
# The default model is not stable on Gaudi, use the older model.
# https://github.com/HabanaAI/vllm-fork/issues/841
LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
Loading