Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion helm-charts/common/lvm-uservice/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

apiVersion: v2
Expand All @@ -13,6 +13,10 @@ dependencies:
version: 0-latest
repository: file://../tgi
condition: tgi.enabled
- name: vllm
version: 0-latest
repository: file://../vllm
condition: vllm.enabled
- name: lvm-serve
version: 0-latest
repository: file://../lvm-serve
Expand Down
30 changes: 19 additions & 11 deletions helm-charts/common/lvm-uservice/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ lvm-uservice depends on one of the following backend services:

- TGI: please refer to [tgi](../tgi) chart for more information

- vLLM: please refer to [vllm](../vllm) chart for more information

- one of the large vision model inference engines: please refer to [lvm-serve](../lvm-serve) chart for more information

First, you need to get the dependent service deployed, i.e. deploy the tgi helm chart, or lvm helm chart.
First, you need to get the dependent service deployed, e.g. deploy the tgi helm chart, the vllm helm chart, or the lvm-serve helm chart.

After you've deployed the dependent service successfully, please run `kubectl get svc` to get the backend service URL, e.g. `http://tgi`, `http://lvm-serve`.
After you've deployed the dependent service successfully, please run `kubectl get svc` to get the backend service URL, e.g. `http://tgi`, `http://myvllm` or `http://lvm-serve`.

To install the `lvm-uservice` chart, run the following:

Expand All @@ -21,10 +23,16 @@ cd GenAIInfra/helm-charts/common/lvm-uservice
helm dependency update
export HFTOKEN="insert-your-huggingface-token-here"

# Use vLLM as the backend
export LLM_MODEL_ID="model-id-used-for-vllm"
export LVM_BACKEND="vLLM"
export LVM_ENDPOINT="http://myvllm"
helm install lvm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set LLM_MODEL_ID=${LLM_MODEL_ID} --set LVM_BACKEND=${LVM_BACKEND} --set LVM_ENDPOINT=${LVM_ENDPOINT} --wait

# Use TGI as the backend
export LVM_BACKEND="TGI"
export LVM_ENDPOINT="http://tgi"
helm install lvm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set LVM_BACKEND=${LVM_BACKEND} --set LVM_ENDPOINT=${LVM_ENDPOINT} --wait
# export LVM_BACKEND="TGI"
# export LVM_ENDPOINT="http://tgi"
# helm install lvm-uservice . --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set LVM_BACKEND=${LVM_BACKEND} --set LVM_ENDPOINT=${LVM_ENDPOINT} --wait

# Use other lvm-serve engine variant as the backend, see file `values.yaml` for more details
# export LVM_ENDPOINT="http://lvm-serve"
Expand All @@ -49,9 +57,9 @@ curl http://localhost:9000/v1/lvm \

## Values

| Key | Type | Default | Description |
| ------------------------------- | ------ | ------- | ------------------------------------------------------------------------------------------------- |
| global.HUGGINGFACEHUB_API_TOKEN | string | `""` | Your own Hugging Face API token |
| LVM_BACKEND | string | `"TGI"` | lvm backend engine, possible value "TGI", "LLaVA", "VideoLlama", "LlamaVision", "PredictionGuard" |
| LVM_ENDPOINT | string | `""` | LVM endpoint |
| global.monitoring | bool | `false` | Service usage metrics |
| Key | Type | Default | Description |
| ------------------------------- | ------ | -------- | --------------------------------------------------------------------------------------------------------- |
| global.HUGGINGFACEHUB_API_TOKEN | string | `""` | Your own Hugging Face API token |
| LVM_BACKEND | string | `"vLLM"` | lvm backend engine, possible value "vLLM", "TGI", "LLaVA", "VideoLlama", "LlamaVision", "PredictionGuard" |
| LVM_ENDPOINT | string | `""` | LVM endpoint |
| global.monitoring | bool | `false` | Service usage metrics |
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
# SPDX-License-Identifier: Apache-2.0

LVM_BACKEND: "LLaVA"
vllm:
enabled: false
tgi:
enabled: false
lvm-serve:
Expand Down
10 changes: 10 additions & 0 deletions helm-charts/common/lvm-uservice/cpu-tgi-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

LVM_BACKEND: "TGI"
vllm:
enabled: false
tgi:
enabled: true
llava:
enabled: false
9 changes: 7 additions & 2 deletions helm-charts/common/lvm-uservice/cpu-values.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

tgi:
LVM_BACKEND: "vLLM"
vllm:
enabled: true
tgi:
enabled: false
lvm-serve:
enabled: false
25 changes: 25 additions & 0 deletions helm-charts/common/lvm-uservice/gaudi-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

LVM_BACKEND: "vLLM"
# The default model is not stable on Gaudi, use the older model.
# https://github.com/HabanaAI/vllm-fork/issues/841
LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
vllm:
enabled: true
accelDevice: "gaudi"
image:
repository: opea/vllm-gaudi
tag: "latest"
OMPI_MCA_btl_vader_single_copy_mechanism: none
PT_HPUGRAPH_DISABLE_TENSOR_CACHE: "false"
VLLM_SKIP_WARMUP: true
LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
extraCmdArgs: ["--tensor-parallel-size", "1", "--chat-template", "examples/template_llava.jinja"]
resources:
limits:
habana.ai/gaudi: 1
tgi:
enabled: false
lvm-serve:
enabled: false
13 changes: 11 additions & 2 deletions helm-charts/common/lvm-uservice/templates/configmap.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

apiVersion: v1
Expand All @@ -14,6 +14,12 @@ data:
{{- if not .Values.LVM_ENDPOINT }}
LVM_ENDPOINT: "http://{{ .Release.Name }}-tgi"
{{- end }}
{{- else if eq "vLLM" .Values.LVM_BACKEND }}
LVM_COMPONENT_NAME: "OPEA_VLLM_LVM"
MAX_IMAGES: {{ .Values.MAX_IMAGES | default 1 | quote }}
{{- if not .Values.LVM_ENDPOINT }}
LVM_ENDPOINT: "http://{{ .Release.Name }}-vllm"
{{- end }}
{{- else }}
{{- if not .Values.LVM_ENDPOINT }}
LVM_ENDPOINT: "http://{{ .Release.Name }}-lvm-serve"
Expand All @@ -34,6 +40,9 @@ data:
{{- if .Values.LVM_ENDPOINT }}
LVM_ENDPOINT: {{ tpl .Values.LVM_ENDPOINT . | quote}}
{{- end }}
{{- if .Values.LLM_MODEL_ID }}
LLM_MODEL_ID: {{ .Values.LLM_MODEL_ID | quote }}
{{- end }}
HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}}
HF_HOME: "/tmp/.cache/huggingface"
{{- if .Values.global.HF_ENDPOINT }}
Expand All @@ -42,7 +51,7 @@ data:
http_proxy: {{ .Values.global.http_proxy | quote }}
https_proxy: {{ .Values.global.https_proxy | quote }}
{{- if and (not .Values.LVM_ENDPOINT) (or .Values.global.http_proxy .Values.global.https_proxy) }}
no_proxy: "{{ .Release.Name }}-tgi,{{ .Release.Name }}-lvm-serve,{{ .Values.global.no_proxy }}"
no_proxy: "{{ .Release.Name }}-tgi,{{ .Release.Name }}-vllm,{{ .Release.Name }}-lvm-serve,{{ .Values.global.no_proxy }}"
{{- else }}
no_proxy: {{ .Values.global.no_proxy | quote }}
{{- end }}
Expand Down
11 changes: 9 additions & 2 deletions helm-charts/common/lvm-uservice/values.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Default values for lvm-uservice.
Expand All @@ -8,12 +8,15 @@
# Set it as a non-null string, such as true, if you want to enable.
LOGFLAG: ""
# backend inference engine to use, e.g. vLLM, TGI, LLaVA, VideoLlama, LlamaVision, PredictionGuard
LVM_BACKEND: "TGI"
LVM_BACKEND: "vLLM"
# maximum image number sent to backend, only valid for TGI, LLaVa backend
MAX_IMAGES: 1
# inference engine service URL, e.g. http://tgi:80
LVM_ENDPOINT: ""

# The vLLM backend requires LLM_MODEL_ID to be set and consistent with the vLLM setting.
LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf

replicaCount: 1

image:
Expand Down Expand Up @@ -105,6 +108,10 @@ global:
prometheusRelease: prometheus-stack

# For CI tests only
vllm:
enabled: false
LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf
extraCmdArgs: ["--chat-template", "examples/template_llava.jinja"]
tgi:
enabled: false
LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf
Expand Down
3 changes: 3 additions & 0 deletions helm-charts/common/vllm/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,6 @@ data:
{{- if .Values.VLLM_TORCH_PROFILER_DIR }}
VLLM_TORCH_PROFILER_DIR: {{ .Values.VLLM_TORCH_PROFILER_DIR | quote }}
{{- end }}
{{- if .Values.PT_HPUGRAPH_DISABLE_TENSOR_CACHE }}
PT_HPUGRAPH_DISABLE_TENSOR_CACHE: {{ .Values.PT_HPUGRAPH_DISABLE_TENSOR_CACHE | quote }}
{{- end }}
2 changes: 2 additions & 0 deletions helm-charts/valuefiles.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ visualqna:
dest_dir: VisualQnA/kubernetes/helm
values:
- cpu-values.yaml
- cpu-tgi-values.yaml
- gaudi-values.yaml
- gaudi-tgi-values.yaml

# Components
agent:
Expand Down
5 changes: 5 additions & 0 deletions helm-charts/visualqna/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,14 @@ name: visualqna
description: The Helm chart to deploy VisualQnA
type: application
dependencies:
- name: vllm
version: 0-latest
repository: "file://../common/vllm"
condition: vllm.enabled
- name: tgi
version: 0-latest
repository: "file://../common/tgi"
condition: tgi.enabled
- name: lvm-uservice
version: 0-latest
repository: "file://../common/lvm-uservice"
Expand Down
25 changes: 23 additions & 2 deletions helm-charts/visualqna/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,28 @@
Helm chart for deploying VisualQnA service. VisualQnA depends on the following services:

- [lvm-uservice](../common/lvm-uservice/README.md)
- [tgi](../common/tgi/README.md)
- [vllm](../common/vllm/README.md)

## Installing the Chart

To install the chart, run the following:

```console
cd GenAIInfra/helm-charts/
./update_dependency.sh
helm dependency update visualqna
export HFTOKEN="insert-your-huggingface-token-here"
export MODELDIR="/mnt/opea-models"
# To use CPU with vLLM
helm install visualqna visualqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR}
# To use Gaudi with vLLM
# helm install visualqna visualqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f visualqna/gaudi-values.yaml

```

### IMPORTANT NOTE

1. Make sure your `MODELDIR` exists on the node where your workload is scheduled so you can cache the downloaded model for next time use. Otherwise, set `global.modelUseHostPath` to 'null' if you don't want to cache the model.

## Verify

Expand Down Expand Up @@ -41,5 +62,5 @@ Open a browser to access `http://<k8s-node-ip-address>:${port}` to play with the
| ----------------- | ------ | ------------------------------------- | -------------------------------------------------------------------------------------- |
| image.repository | string | `"opea/visualqna"` | |
| service.port | string | `"8888"` | |
| tgi.LLM_MODEL_ID | string | `"llava-hf/llava-v1.6-mistral-7b-hf"` | Models id from https://huggingface.co/, or predownloaded model directory |
| vllm.LLM_MODEL_ID | string | `"llava-hf/llava-v1.6-mistral-7b-hf"` | Model id from https://huggingface.co/, or predownloaded model directory |
| global.monitoring | bool | `false` | Enable usage metrics for the service components. See ../monitoring.md before enabling! |
9 changes: 9 additions & 0 deletions helm-charts/visualqna/cpu-tgi-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

tgi:
enabled: true
vllm:
enabled: false
lvm-uservice:
LVM_BACKEND: "TGI"
10 changes: 6 additions & 4 deletions helm-charts/visualqna/cpu-values.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

vllm:
enabled: true
tgi:
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
LLM_MODEL_ID: llava-hf/llava-v1.6-mistral-7b-hf
enabled: false
lvm-uservice:
LVM_BACKEND: "vLLM"
41 changes: 41 additions & 0 deletions helm-charts/visualqna/gaudi-tgi-values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

# TGI: largest bottleneck for VisualQnA
tgi:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
vllm:
enabled: false
lvm-uservice:
LVM_BACKEND: "TGI"
46 changes: 17 additions & 29 deletions helm-charts/visualqna/gaudi-values.yaml
Original file line number Diff line number Diff line change
@@ -1,36 +1,24 @@
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

# TGI: largest bottleneck for VisualQnA
tgi:
vllm:
enabled: true
accelDevice: "gaudi"
image:
repository: ghcr.io/huggingface/tgi-gaudi
tag: "2.3.1"
repository: opea/vllm-gaudi
tag: "latest"
LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
VLLM_SKIP_WARMUP: true
OMPI_MCA_btl_vader_single_copy_mechanism: none
PT_HPUGRAPH_DISABLE_TENSOR_CACHE: "false"
extraCmdArgs: ["--tensor-parallel-size","1","--chat-template","examples/template_llava.jinja"]
resources:
limits:
habana.ai/gaudi: 1
MAX_INPUT_LENGTH: "4096"
MAX_TOTAL_TOKENS: "8192"
CUDA_GRAPHS: ""
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
ENABLE_HPU_GRAPH: "true"
LIMIT_HPU_GRAPH: "true"
USE_FLASH_ATTENTION: "true"
FLASH_ATTENTION_RECOMPUTE: "true"
livenessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
readinessProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
startupProbe:
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 1
failureThreshold: 120
tgi:
enabled: false
lvm-uservice:
LVM_BACKEND: "vLLM"
# The default model is not stable on Gaudi, use the older model.
# https://github.com/HabanaAI/vllm-fork/issues/841
LLM_MODEL_ID: llava-hf/llava-1.5-7b-hf
Loading