Skip to content
Merged
71 changes: 70 additions & 1 deletion comps/lvms/deployment/docker_compose/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,58 @@ services:
interval: 30s
timeout: 6s
retries: 20
vllm-service:
  # vLLM OpenAI-compatible server used as the LVM backend.
  # The image tag follows ${TAG:-latest}, matching vllm-gaudi-service, so a
  # release can pin both images with a single variable (default: latest).
  image: ${REGISTRY:-opea}/vllm:${TAG:-latest}
  container_name: vllm-service
  ports:
    # Quoted so the HOST:CONTAINER mapping is always read as a string.
    # Same default host port as vllm-gaudi-service, so override VLLM_PORT for
    # one of them if both are started together.
    - "${VLLM_PORT:-9699}:80"
  volumes:
    - "./data:/data"
  shm_size: 128g
  environment:
    no_proxy: ${no_proxy}
    http_proxy: ${http_proxy}
    https_proxy: ${https_proxy}
    HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
    LLM_MODEL_ID: ${LLM_MODEL_ID}
    VLLM_TORCH_PROFILER_DIR: "/mnt"
  healthcheck:
    # Polls the server's /health endpoint on the container port.
    test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
    interval: 10s
    timeout: 10s
    retries: 100
  command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --chat-template examples/template_llava.jinja  # https://docs.vllm.ai/en/v0.5.0/models/vlm.html
vllm-gaudi-service:
  # vLLM OpenAI-compatible server for Habana Gaudi (runtime: habana),
  # used as the LVM backend.
  image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest}
  container_name: vllm-gaudi-service
  ports:
    # Quoted so the HOST:CONTAINER mapping is always read as a string.
    # Same default host port as vllm-service, so override VLLM_PORT for one of
    # them if both are started together.
    - "${VLLM_PORT:-9699}:80"
  volumes:
    - "./data:/data"
  shm_size: 128g
  environment:
    no_proxy: ${no_proxy}
    http_proxy: ${http_proxy}
    https_proxy: ${https_proxy}
    HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
    HABANA_VISIBLE_DEVICES: all
    OMPI_MCA_btl_vader_single_copy_mechanism: none
    LLM_MODEL_ID: ${LLM_MODEL_ID}
    VLLM_TORCH_PROFILER_DIR: "/mnt"
    VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
    # Both length limits are driven by MAX_TOTAL_TOKENS (default 4096).
    MAX_MODEL_LEN: ${MAX_TOTAL_TOKENS:-4096}
    MAX_SEQ_LEN_TO_CAPTURE: ${MAX_TOTAL_TOKENS:-4096}
    # Quoted: environment values must be strings, and a bare `false` is parsed
    # as a YAML boolean.
    PT_HPUGRAPH_DISABLE_TENSOR_CACHE: "false"  # https://github.com/HabanaAI/vllm-fork/issues/841#issuecomment-2700421704
  runtime: habana
  cap_add:
    - SYS_NICE
  ipc: host
  healthcheck:
    # Polls the server's /health endpoint on the container port.
    test: ["CMD-SHELL", "curl -f http://localhost:80/health || exit 1"]
    interval: 10s
    timeout: 10s
    retries: 150
  command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --chat-template examples/template_llava.jinja  # https://docs.vllm.ai/en/v0.5.0/models/vlm.html
llava-tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: llava-tgi-service
Expand Down Expand Up @@ -99,7 +151,8 @@ services:
ipc: host
environment:
LVM_ENDPOINT: ${LVM_ENDPOINT}
LVM_COMPONENT_NAME: ${LVM_COMPONENT_NAME:-OPEA_LLAVA_LVM}
LVM_COMPONENT_NAME: ${LVM_COMPONENT_NAME:-OPEA_VLLM_LVM}
LLM_MODEL_ID: ${LLM_MODEL_ID}
lvm-llava:
extends: lvm
container_name: lvm-llava-service
Expand Down Expand Up @@ -140,6 +193,22 @@ services:
depends_on:
video-llama-service:
condition: service_healthy
lvm-vllm:
  # LVM microservice variant backed by vllm-service. It reuses the shared
  # `lvm` definition and is started only once vllm-service reports healthy.
  extends: lvm
  container_name: lvm-vllm-service
  depends_on:
    vllm-service:
      condition: service_healthy
  environment:
    LVM_COMPONENT_NAME: ${LVM_COMPONENT_NAME:-OPEA_VLLM_LVM}
lvm-vllm-gaudi:
  # LVM microservice variant backed by vllm-gaudi-service. It reuses the shared
  # `lvm` definition and is started only once vllm-gaudi-service reports healthy.
  extends: lvm
  container_name: lvm-vllm-gaudi-service
  depends_on:
    vllm-gaudi-service:
      condition: service_healthy
  environment:
    LVM_COMPONENT_NAME: ${LVM_COMPONENT_NAME:-OPEA_VLLM_LVM}

networks:
default:
Expand Down
24 changes: 22 additions & 2 deletions comps/lvms/src/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,36 @@ export LVM_ENDPOINT=http://$ip_address:$VIDEO_LLAMA_PORT
docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up video-llama-service lvm-video-llama -d
```

- vLLM

```bash
# Gaudi only: the opea/vllm-gaudi image currently has to be built locally from the habana_main branch of HabanaAI/vllm-fork (it is used by vllm-gaudi-service / lvm-vllm-gaudi)
git clone https://github.com/HabanaAI/vllm-fork.git
cd ./vllm-fork/
git checkout habana_main
docker build -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
cd ..
rm -rf vllm-fork


export ip_address=$(hostname -I | awk '{print $1}')
export LVM_PORT=9399
export VLLM_PORT=11507
export LVM_ENDPOINT=http://$ip_address:$VLLM_PORT
export LLM_MODEL_ID=llava-hf/llava-1.5-7b-hf
docker compose -f comps/lvms/deployment/docker_compose/compose.yaml up vllm-service lvm-vllm -d
```

## Test

- LLaVA & llama-vision & PredictionGuard & TGI LLaVA
- vLLM & LLaVA native & llama-vision & PredictionGuard & TGI LLaVA

```bash
# curl with an image and a prompt
http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?"}' -H 'Content-Type: application/json'

# curl with only the prompt
http_proxy="" curl http://localhost:9399/v1/lvm --silent --write-out "HTTPSTATUS:%{http_code}" -XPOST -d '{"image": "", "prompt":"What is deep learning?"}' -H 'Content-Type: application/json'
http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "", "prompt":"What is deep learning?"}' -H 'Content-Type: application/json'
```

- video-llama
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ iopath
langchain
langchain-community
langchain-core
numpy
numpy==1.26.4
omegaconf
opencv-python-headless
opentelemetry-api
Expand Down
222 changes: 222 additions & 0 deletions comps/lvms/src/integrations/vllm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os
import time
from typing import Union

import requests
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from langchain_core.prompts import PromptTemplate
from openai import OpenAI

from comps import (
CustomLogger,
LVMDoc,
LVMSearchedMultimodalDoc,
MetadataTextDoc,
OpeaComponent,
OpeaComponentRegistry,
ServiceType,
TextDoc,
statistics_dict,
)

# Module-level logger for the vLLM LVM integration.
logger = CustomLogger("opea_vllm")
# Verbose-logging switch. os.getenv returns the raw string when LOGFLAG is set,
# so any non-empty value (including "false" or "0") is truthy in `if logflag:`.
logflag = os.getenv("LOGFLAG", False)

# The maximum number of images that should be sent to the LVM
# (currently disabled; kept for reference)
# max_images = int(os.getenv("MAX_IMAGES", 1))
# Model name passed as `model` in chat completion requests.
LLM_MODEL_ID = os.getenv("LLM_MODEL_ID", "llava-hf/llava-1.5-7b-hf")


class ChatTemplate:
    """Prompt builders for the vLLM LVM integration."""

    @staticmethod
    def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_image: bool = False):
        """Combine a retrieved context and a user question into one prompt string.

        Args:
            question: The user's question, appended after the context sentence.
            context: Retrieved text that is quoted inside the prompt.
            has_image: Selects the wording; when truthy the context is presented
                as the transcript associated with an image, otherwise as results
                from the local knowledge base.

        Returns:
            The formatted prompt string.
        """
        wording_by_image_presence = {
            True: """The transcript associated with the image is '{context}'. {question}""",
            False: """Refer to the following results obtained from the local knowledge base: '{context}'. {question}""",
        }
        chosen = wording_by_image_presence[bool(has_image)]
        return chosen.format(context=context, question=question)


@OpeaComponentRegistry.register("OPEA_VLLM_LVM")
class OpeaVllmLvm(OpeaComponent):
    """A specialized vLLM LVM component derived from OpeaComponent for vLLM LVM services.

    Requests are sent to the server at ``LVM_ENDPOINT`` (default
    ``http://localhost:8399``) through the OpenAI chat completions client,
    using the model named by the module-level ``LLM_MODEL_ID``.
    """

    # Seconds to wait for the /health probe; without a timeout an unresponsive
    # endpoint would block component construction indefinitely.
    _HEALTH_CHECK_TIMEOUT = 10

    # Placeholder image sent when the request carries no image.
    # Works around an issue where LLaVA-NeXT is not providing good responses when prompted without an image.
    # The base64 string below is the encoded png:
    # https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png
    _PLACEHOLDER_IMG_B64 = "iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC"

    def __init__(self, name: str, description: str, config: dict = None):
        """Create the OpenAI client for the vLLM endpoint and run an initial health check.

        Args:
            name: Component name, forwarded to OpeaComponent.
            description: Component description, forwarded to OpeaComponent.
            config: Optional configuration dict, forwarded to OpeaComponent.
        """
        super().__init__(name, ServiceType.LVM.name.lower(), description, config)
        self.base_url = os.getenv("LVM_ENDPOINT", "http://localhost:8399")
        # https://github.com/huggingface/huggingface_hub/blob/v0.29.1/src/huggingface_hub/inference/_providers/hf_inference.py#L87
        # latest AsyncInferenceClient has model hardcoded issues to "tgi"
        # so we use OpenAI client
        self.lvm_client = OpenAI(api_key="EMPTY", base_url=f"{self.base_url}/v1")
        health_status = self.check_health()
        # if logflag:
        #     logger.info(f"MAX_IMAGES: {max_images}")
        if not health_status:
            # A failed probe is only logged; construction still succeeds.
            logger.error("OpeaVllmLvm health check failed.")

    def _create_chat_completion(self, prompt, img_b64_str, max_new_tokens, temperature, top_p, stream):
        """Send one user turn (text + one base64 PNG image) to the chat completions API.

        Shared by the streaming and non-streaming paths so both issue the same request.

        Args:
            prompt: Text part of the user message.
            img_b64_str: Base64 image payload, embedded as a ``data:image/png`` URL.
            max_new_tokens: Forwarded as ``max_tokens``.
            temperature: Sampling temperature.
            top_p: Nucleus sampling parameter.
            stream: Whether to request a streamed response.

        Returns:
            Whatever ``chat.completions.create`` returns (an iterable of chunks
            when ``stream`` is true, otherwise a completion object).
        """
        # https://docs.vllm.ai/en/v0.5.1/getting_started/examples/openai_vision_api_client.html
        # vLLM chat completions api
        # TODO align legacy LVMDoc with chat completions parameters for vLLM
        # Now we simply keep the intersection of them
        # TODO check vLLM multi-image inputs https://platform.openai.com/docs/guides/vision#multiple-image-inputs
        return self.lvm_client.chat.completions.create(
            model=LLM_MODEL_ID,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_b64_str}"}},
                    ],
                }
            ],
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=stream,
        )

    async def invoke(
        self,
        request: Union[LVMDoc, LVMSearchedMultimodalDoc],
    ) -> Union[TextDoc, MetadataTextDoc]:
        """Invoke the LVM service to generate an answer for the provided input.

        Args:
            request: Either an LVMDoc (image + prompt) or an
                LVMSearchedMultimodalDoc coming from the retriever microservice,
                in which case the prompt is built from the first retrieved
                metadata entry.

        Returns:
            A StreamingResponse of server-sent events when ``request.stream`` is
            truthy; otherwise a MetadataTextDoc (for LVMSearchedMultimodalDoc
            input) or a TextDoc (for LVMDoc input).

        Raises:
            HTTPException: status 500 when an LVMSearchedMultimodalDoc carries no metadata.
        """
        if logflag:
            logger.info(request)
        if isinstance(request, LVMSearchedMultimodalDoc):
            # TODO may bugs here
            if logflag:
                logger.info("[LVMSearchedMultimodalDoc ] input from retriever microservice")
            retrieved_metadatas = request.metadata
            if retrieved_metadatas is None or len(retrieved_metadatas) == 0:
                raise HTTPException(status_code=500, detail="There is no video segments retrieved given the query!")

            # Only the first retrieved entry is used for the image and the context.
            img_b64_str = retrieved_metadatas[0]["b64_img_str"]
            has_image = img_b64_str != ""
            initial_query = request.initial_query
            context = retrieved_metadatas[0]["transcript_for_inference"]
            prompt = initial_query
            if request.chat_template is None:
                prompt = ChatTemplate.generate_multimodal_rag_on_videos_prompt(initial_query, context, has_image)
            else:
                prompt_template = PromptTemplate.from_template(request.chat_template)
                input_variables = prompt_template.input_variables
                if sorted(input_variables) == ["context", "question"]:
                    prompt = prompt_template.format(question=initial_query, context=context)
                else:
                    # Unsupported template: fall back to the bare query.
                    logger.info(
                        f"[ LVMSearchedMultimodalDoc ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
                    )
            if logflag:
                logger.info(
                    f"prompt generated for [LVMSearchedMultimodalDoc ] input from retriever microservice: {prompt}"
                )
        else:
            # TODO align legacy LVMDoc with chat completions parameters for vLLM
            img_b64_str = request.image
            prompt = request.prompt

        # Only the parameters shared with the chat completions API are forwarded;
        # repetition_penalty and top_k are intentionally not read.
        # NOTE(review): the microservice hunk in this change checks `request.streaming`
        # while this reads `request.stream` - confirm the field name on both request types.
        max_new_tokens = request.max_new_tokens
        stream = request.stream
        temperature = request.temperature
        top_p = request.top_p

        if not img_b64_str:
            # An empty img_b64_str means we have just a text prompt; provide a
            # placeholder image so the request still carries one.
            img_b64_str = self._PLACEHOLDER_IMG_B64

        if stream:
            t_start = time.time()

            def stream_generator(time_start):
                first_token_latency = None
                chat_response = ""

                text_generation = self._create_chat_completion(
                    prompt, img_b64_str, max_new_tokens, temperature, top_p, stream=True
                )

                for output in text_generation:
                    # A streamed chunk may carry no choices, or a delta whose
                    # content is None; skip those instead of failing on
                    # indexing or on str + None.
                    if not output.choices:
                        continue
                    text = output.choices[0].delta.content
                    if text is None:
                        continue
                    if first_token_latency is None:
                        first_token_latency = time.time() - time_start
                    chat_response += text
                    chunk_repr = repr(text.encode("utf-8"))
                    if logflag:
                        logger.info(f"[llm - chat_stream] chunk:{chunk_repr}")
                    yield f"data: {chunk_repr}\n\n"
                if logflag:
                    logger.info(f"[llm - chat_stream] stream response: {chat_response}")
                statistics_dict["opea_service@lvm"].append_latency(time.time() - time_start, first_token_latency)
                yield "data: [DONE]\n\n"

            return StreamingResponse(stream_generator(t_start), media_type="text/event-stream")

        # NOTE(review): the OpenAI client here is the synchronous one, so this
        # call blocks inside an async method until the server responds.
        generated_output = self._create_chat_completion(
            prompt, img_b64_str, max_new_tokens, temperature, top_p, stream=False
        )
        generated_str = generated_output.choices[0].message.content

        if logflag:
            logger.info(generated_str)
        if isinstance(request, LVMSearchedMultimodalDoc):
            # TODO Check bugs here
            retrieved_metadata = request.metadata[0]
            # this metadata will be used to construct proof for generated text
            return_metadata = {
                "video_id": retrieved_metadata["video_id"],
                "source_video": retrieved_metadata["source_video"],
                "time_of_frame_ms": retrieved_metadata["time_of_frame_ms"],
                "transcript_for_inference": retrieved_metadata["transcript_for_inference"],
            }
            return MetadataTextDoc(text=generated_str, metadata=return_metadata)
        return TextDoc(text=generated_str)

    def check_health(self) -> bool:
        """Checks the health of the vLLM LVM service.

        Sends a GET request to ``{base_url}/health`` with a bounded timeout.

        Returns:
            bool: True if the service answers with HTTP 200, False otherwise
            (including connection errors and timeouts, which are logged).
        """
        try:
            response = requests.get(f"{self.base_url}/health", timeout=self._HEALTH_CHECK_TIMEOUT)
            return response.status_code == 200
        except Exception as e:
            # Handle connection errors, timeouts, etc.
            logger.error(f"Health check failed: {e}")
            return False
5 changes: 3 additions & 2 deletions comps/lvms/src/opea_lvm_microservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from integrations.predictionguard import OpeaPredictionguardLvm
from integrations.tgi_llava import OpeaTgiLlavaLvm
from integrations.video_llama import OpeaVideoLlamaLvm
from integrations.vllm import OpeaVllmLvm

from comps import (
CustomLogger,
Expand All @@ -29,7 +30,7 @@
logger = CustomLogger("opea_lvm_microservice")
logflag = os.getenv("LOGFLAG", False)

lvm_component_name = os.getenv("LVM_COMPONENT_NAME", "OPEA_LLAVA_LVM")
lvm_component_name = os.getenv("LVM_COMPONENT_NAME", "OPEA_VLLM_LVM")
# Initialize OpeaComponentController
loader = OpeaComponentLoader(lvm_component_name, description=f"OPEA LVM Component: {lvm_component_name}")

Expand All @@ -54,7 +55,7 @@ async def lvm(
logger.info(lvm_response)

if loader.component.name in ["OpeaVideoLlamaLvm"] or (
loader.component.name in ["OpeaTgiLlavaLvm"] and request.streaming
loader.component.name in ["OpeaTgiLlavaLvm", "OpeaVllmLvm"] and request.streaming
):
# statistics for StreamingResponse are handled inside the integrations
# here directly return the response
Expand Down
1 change: 1 addition & 0 deletions comps/lvms/src/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ docarray[full]
fastapi
huggingface_hub
langchain-core
openai
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
Expand Down
Loading
Loading