Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
51651aa
Backend enhancements for image query capabilities for MultimodalQnA
dmsuehir Nov 22, 2024
f83e2e1
Fix model name var
dmsuehir Nov 22, 2024
1a61cb5
Merge branch 'mmqna-phase2' of github.com:mhbuehler/GenAIComps into d…
dmsuehir Nov 25, 2024
1f0dfcd
Remove space at end of prompt
dmsuehir Nov 26, 2024
107680d
Merge branch 'mmqna-phase2' of github.com:mhbuehler/GenAIComps into d…
dmsuehir Dec 2, 2024
5b51771
Add env var for the max number of images sent to the LVM
dmsuehir Dec 2, 2024
242ee6f
README update for the MAX_IMAGES env var
dmsuehir Dec 2, 2024
8b21819
Merge branch 'dina/image_query' of github.com:mhbuehler/GenAIComps in…
dmsuehir Dec 2, 2024
5b41724
Remove prints
dmsuehir Dec 2, 2024
ae5437a
Audio query functionality to multimodal backend (#8)
okhleif-10 Dec 2, 2024
f4a7199
Merge branch 'mmqna-phase2' of github.com:mhbuehler/GenAIComps into d…
dmsuehir Dec 3, 2024
e1e5fde
Merge branch 'main' into mmqna-audio-query
mhbuehler Dec 4, 2024
70c54e1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 4, 2024
6a71843
fixed role bug where i never was > 0
okhleif-10 Dec 4, 2024
411bfdf
Fix after merge
dmsuehir Dec 4, 2024
615459b
removed whitespace
okhleif-10 Dec 4, 2024
1753473
Merge pull request #13 from mhbuehler/omar/role-debug
mhbuehler Dec 4, 2024
dcafe8d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 4, 2024
e32bef4
Fix call to get role labels
dmsuehir Dec 4, 2024
63c08fe
Merge branch 'mmqna-audio-query' of github.com:mhbuehler/GenAIComps i…
dmsuehir Dec 4, 2024
db22c47
Gateway test updates images within the conversation
dmsuehir Dec 5, 2024
fa47959
Adds unit test coverage for audio query
mhbuehler Dec 5, 2024
02efc8a
Update test to check the returned b64 types
dmsuehir Dec 5, 2024
d74bb32
Update test since we don't expect images from the assistant
dmsuehir Dec 5, 2024
37826be
Port number fix
mhbuehler Dec 6, 2024
40d34db
Formatting
mhbuehler Dec 6, 2024
6f2a753
Merge pull request #14 from mhbuehler/melanie/add_test_coverage
mhbuehler Dec 6, 2024
a665c3c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 6, 2024
4a5c8ea
Merge branch 'main' into mmqna-audio-query
ashahba Dec 6, 2024
d9ab567
Fixed place where port number is set
mhbuehler Dec 6, 2024
75b135f
Merge pull request #15 from mhbuehler/melanie/port_placement
mhbuehler Dec 6, 2024
9a077c5
Remove old comment and added more accurate description
dmsuehir Dec 9, 2024
b21e575
add comment in code about MAX_IMAGES
dmsuehir Dec 9, 2024
a3abd8a
Add Gaudi support for image query
dmsuehir Dec 10, 2024
b8dbabf
Merge branch 'mmqna-audio-query' of github.com:mhbuehler/GenAIComps i…
dmsuehir Dec 10, 2024
c87504c
Merge branch 'mmqna-image-query' of github.com:mhbuehler/GenAIComps i…
dmsuehir Dec 12, 2024
723f0c3
Fix to pass the retrieved image last
dmsuehir Dec 12, 2024
b1205f4
Revert out gateway and gateway test code, due to its move to GenAIExa…
dmsuehir Dec 12, 2024
bac117a
Fix retriever test for checking for b64_img_str in the result
dmsuehir Dec 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion comps/cores/proto/docarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ class GraphDoc(BaseDoc):


class LVMDoc(BaseDoc):
image: str
image: Union[str, List[str]]
prompt: str
max_new_tokens: conint(ge=0, le=1024) = 512
top_k: int = 10
Expand Down
7 changes: 7 additions & 0 deletions comps/embeddings/multimodal/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,18 @@ docker compose -f docker_compose_multimodal_embedding.yaml up -d

**Compute a joint embedding of an image-text pair**

The image can be passed as a URL:
```bash
curl -X POST http://0.0.0.0:6600/v1/embeddings \
-H "Content-Type: application/json" \
-d '{"text": {"text" : "This is some sample text."}, "image" : {"url": "https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true"}}'
```
Or as a base64 encoded string:
```bash
curl -X POST http://0.0.0.0:6600/v1/embeddings \
-H "Content-Type: application/json" \
-d '{"text": {"text" : "This is some sample text."}, "image" : {"base64_image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}}'
```

**Compute an embedding of a text**

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import requests
from fastapi.responses import JSONResponse
from typing import Union

from comps import (
CustomLogger,
Expand Down Expand Up @@ -38,7 +39,7 @@
output_datatype=EmbedMultimodalDoc,
)
@register_statistics(names=["opea_service@multimodal_embedding_mmei_langchain"])
def embedding(input: MultimodalDoc) -> EmbedDoc:
def embedding(input: MultimodalDoc) -> Union[EmbedDoc, EmbedMultimodalDoc]:
start = time.time()
if logflag:
logger.info(input)
Expand All @@ -48,9 +49,15 @@ def embedding(input: MultimodalDoc) -> EmbedDoc:
json["text"] = input.text
elif isinstance(input, TextImageDoc):
json["text"] = input.text.text
img_bytes = input.image.url.load_bytes()
base64_img = base64.b64encode(img_bytes).decode("utf-8")
json["img_b64_str"] = base64_img
base64_img = ""
if input.image.url:
img_bytes = input.image.url.load_bytes()
base64_img = base64.b64encode(img_bytes).decode("utf-8")
elif input.image.base64_image:
base64_img = input.image.base64_image

if base64_img:
json["img_b64_str"] = base64_img
else:
return JSONResponse(status_code=400, content={"message": "Bad request!"})

Expand All @@ -66,6 +73,9 @@ def embedding(input: MultimodalDoc) -> EmbedDoc:
res = EmbedDoc(text=input.text, embedding=embed_vector)
elif isinstance(input, TextImageDoc):
res = EmbedMultimodalDoc(text=input.text.text, url=input.image.url, embedding=embed_vector)

if base64_img:
res.base64_image = base64_img
except requests.exceptions.ConnectionError:
res = JSONResponse(status_code=503, content={"message": "Multimodal embedding endpoint not started!"})
statistics_dict["opea_service@multimodal_embedding_mmei_langchain"].append_latency(time.time() - start, None)
Expand Down
12 changes: 10 additions & 2 deletions comps/lvms/llava/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# LVM Microservice

Visual Question and Answering is one of the multimodal tasks empowered by LVMs (Large Visual Models). This microservice supports visual Q&A by using LLaVA as the base large visual model. It accepts two inputs: a prompt and an image. It outputs the answer to the prompt about the image.
Visual Question Answering is one of the multimodal tasks empowered by LVMs (Large Visual Models). This microservice supports visual Q&A by using LLaVA as the base large visual model. It accepts two inputs: a prompt and images. It outputs the answer to the prompt about the images.

## 🚀1. Start Microservice with Python (Option 1)

Expand Down Expand Up @@ -92,10 +92,15 @@ docker run -p 8399:8399 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_M

#### 2.2.2 Start LVM service

> Note: The `MAX_IMAGES` environment variable is used to specify the maximum number of images that will be sent from the LVM service to the LLaVA server.
> If an image list longer than `MAX_IMAGES` is sent to the LVM server, a shortened image list will be sent to the LLaVA service. If the image list
> needs to be shortened, the most recent images (the ones at the end of the list) are prioritized to send to the LLaVA service. Some LLaVA models have not
> been trained with multiple images and may lead to inaccurate results. If `MAX_IMAGES` is not set, it will default to `1`.

```bash
ip_address=$(hostname -I | awk '{print $1}')

docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LVM_ENDPOINT=http://$ip_address:8399 opea/lvm-llava-svc:latest
docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LVM_ENDPOINT=http://$ip_address:8399 -e MAX_IMAGES=1 opea/lvm-llava-svc:latest
```

#### 2.2.3 Test
Expand All @@ -106,6 +111,9 @@ docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$htt
# curl with an image and a prompt
http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?"}' -H 'Content-Type: application/json'

# curl with multiple images and a prompt (Note that depending on your MAX_IMAGES value, both images may not be sent to the LLaVA model)
http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC"], "prompt":"What is in these images?"}' -H 'Content-Type: application/json'

# curl with a prompt only (no image)
http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "", "prompt":"What is deep learning?"}' -H 'Content-Type: application/json'

Expand Down
83 changes: 68 additions & 15 deletions comps/lvms/llava/dependency/llava_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response
from transformers import AutoProcessor
from transformers import pipeline
from transformers.image_utils import load_image

Expand All @@ -33,9 +34,16 @@ def pipeline_preprocess(self, image, prompt=None, timeout=None):
The original transformers image-to-text pipeline preprocess function requires that an image is passed in, and will
fail if the image parameter is null/empty. In order to support multimodal use cases with the same pipeline, this
preprocess function handles the case where there is no image with the prompt.
Also, the image-to-text pipeline typically treats multiple images passed in as a list as a batch (where it iterates
over the image inputs for generation). For that reason, the original pipeline_preprocess code would only get a
single image at a time. To support multiple images, the pipeline call is updated to send a list of lists for the
images (so that when iterated, we still get multiple images) and this pipeline_preprocess function has been updated
to handle a list of images in addition to single images.
"""

if image:
if isinstance(image, list):
image = [load_image(i, timeout=timeout) for i in image]
elif image:
image = load_image(image, timeout=timeout)

if prompt is not None:
Expand Down Expand Up @@ -114,23 +122,52 @@ async def health() -> Response:


@app.post("/generate")
async def generate(request: Request) -> Response: # FIXME batch_size=1 for now, only accept single image
async def generate(request: Request) -> Response: # FIXME batch_size=1 for now
print("LLaVA generation begin.")
request_dict = await request.json()
prompt = request_dict.pop("prompt")
img_b64_str = request_dict.pop("img_b64_str")
img_b64_str = request_dict.pop("img_b64_str") # String or list of strings
max_new_tokens = request_dict.pop("max_new_tokens", 100)

# Determine the format of the role labels based on the model name
model_name = generator.model.name_or_path
user_label = "USER:"
assistant_label = "ASSISTANT:"
image_tag = "<image>\n"

# This is the role label that we see in the results from the pipeline. This is used to split the output.
output_assistant_label = "ASSISTANT: "

if "llava-interleave" in model_name:
user_label = "<|im_start|>user"
assistant_label = "<|im_end|><|im_start|>assistant"
output_assistant_label = "assistant "
elif "llava-v1.6-mistral" in model_name:
user_label = "[INST]"
assistant_label = " [/INST]"
output_assistant_label = "[/INST] "

if img_b64_str:
# Decode and Resize the image
image = PIL.Image.open(BytesIO(base64.b64decode(img_b64_str)))
image = process_image(image)
# format the prompt with an image
prompt = f"<image>\nUSER: {prompt}\nASSISTANT:"
if isinstance(img_b64_str, str):
img_b64_str = [img_b64_str]

# Decode and Resize the images
images = []
for img_b64 in img_b64_str:
if img_b64:
image = PIL.Image.open(BytesIO(base64.b64decode(img_b64)))
image = process_image(image)
images.append(image)

# If the prompt provided does not have all the image tags, format the prompt with images
num_images = len(images)
num_image_tags = prompt.count(image_tag)
image_tags = image_tag * (num_images - num_image_tags) if num_images > num_image_tags else ""
prompt = f"{user_label}{image_tags} {prompt}{assistant_label}"
else:
image = None
images = None
# format the prompt with text only
prompt = f"USER: {prompt}\nASSISTANT:"
prompt = f"{user_label} {prompt}\n{assistant_label}"

if args.device == "hpu":
generate_kwargs = {
Expand All @@ -149,12 +186,13 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now,
# Override the pipeline preprocessing
generator.preprocess = pipeline_preprocess.__get__(generator, type(generator))

result = generator(image, prompt=prompt, batch_size=1, generate_kwargs=generate_kwargs)
result = generator([images], prompt=prompt, batch_size=1, generate_kwargs=generate_kwargs)
end = time.time()
result = result[0]["generated_text"].split("ASSISTANT: ")[-1]
result = result[0][0]["generated_text"].split(output_assistant_label.strip())[-1].strip()
print(f"LLaVA result = {result}, time = {(end-start) * 1000 }ms")
if image:
image.close()
if images:
for i in images:
i.close()

ret = {"text": result}
return JSONResponse(ret)
Expand Down Expand Up @@ -191,6 +229,8 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now,
device=args.device,
)

processor = AutoProcessor.from_pretrained(model_name_or_path)

# warmup
print("LLaVA warmup...")
if args.device == "hpu":
Expand All @@ -214,10 +254,23 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now,
images = []
for image_path in image_paths:
images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw))

# Generate a text prompt to use for warm up
conversation = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What's the content of the image?"},
],
},
]
text_prompt = processor.apply_chat_template(conversation)

for i in range(args.warmup):
generator(
images,
prompt="<image>\nUSER: What's the content of the image?\nASSISTANT:",
prompt=text_prompt,
batch_size=1,
generate_kwargs=generate_kwargs,
)
Expand Down
17 changes: 17 additions & 0 deletions comps/lvms/llava/lvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
logger = CustomLogger("lvm")
logflag = os.getenv("LOGFLAG", False)

# The maximum number of images that should be sent to the LVM
max_images = int(os.getenv("MAX_IMAGES", 1))
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this line, is 1 being set manually? Or is it a default?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This means that if MAX_IMAGES is unset, it will default to 1



@register_microservice(
name="opea_service@lvm",
Expand Down Expand Up @@ -76,6 +79,17 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
prompt = request.prompt
max_new_tokens = request.max_new_tokens

# Limit the number of images being sent to the LVM
if isinstance(img_b64_str, list) and len(img_b64_str) > max_images:
img_b64_str=img_b64_str[-max_images:]

# Adjust the number of images tags in the prompt
image_tag = "<image>\n"
num_tags_in_prompt = prompt.count(image_tag)

if len(img_b64_str) < num_tags_in_prompt:
prompt = prompt.replace(image_tag, "", num_tags_in_prompt - len(img_b64_str))

inputs = {"img_b64_str": img_b64_str, "prompt": prompt, "max_new_tokens": max_new_tokens}
# forward to the LLaVA server
response = requests.post(url=f"{lvm_endpoint}/generate", data=json.dumps(inputs), proxies={"http": None})
Expand All @@ -99,5 +113,8 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
if __name__ == "__main__":
lvm_endpoint = os.getenv("LVM_ENDPOINT", "http://localhost:8399")

if logflag:
logger.info(f"MAX_IMAGES: {max_images}")

logger.info("[LVM] LVM initialized.")
opea_microservices["opea_service@lvm"].start()
47 changes: 39 additions & 8 deletions comps/lvms/tgi-llava/lvm_tgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
logger = CustomLogger("lvm_tgi")
logflag = os.getenv("LOGFLAG", False)

# The maximum number of images that should be sent to the LVM
max_images = int(os.getenv("MAX_IMAGES", 1))


@register_microservice(
name="opea_service@lvm_tgi",
Expand Down Expand Up @@ -88,15 +91,41 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
top_k = request.top_k
top_p = request.top_p

if not img_b64_str:
# Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image.
# Provide an image and then instruct the model to ignore the image. The base64 string below is the encoded png:
# https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png
img_b64_str = "iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC"
prompt = f"Please disregard the image and answer the question. {prompt}"
# Make img_b64_str into a list of strings (if it's not already a list)
if not isinstance(img_b64_str, list):
if img_b64_str:
img_b64_str = [img_b64_str]
else:
# If img_b64_str was an empty string, that means we just have a text prompt.
# Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image.
# Provide an image and then instruct the model to ignore the image. The base64 string below is the encoded png:
# https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png
img_b64_str = ["iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC"]
prompt = f"Please disregard the image and answer the question. {prompt}"

# Truncate the list of images if we have too many, only sending the most recent ones at the end of the list
if len(img_b64_str) > max_images:
img_b64_str=img_b64_str[-max_images:]

image = f"data:image/png;base64,{img_b64_str}"
image_prompt = f"![]({image})\n{prompt}\nASSISTANT:"
# Check the number of image tags in the prompt and adjust them to match the number of images that we have
image_tag = "<image>\n"
num_tags_in_prompt = prompt.count(image_tag)

# If we have too many image tags in the prompt, replace the first instances of the tag with an empty string
if len(img_b64_str) < num_tags_in_prompt:
prompt = prompt.replace(image_tag, "", num_tags_in_prompt - len(img_b64_str))

# We don't have enough image tags in the prompt, add them
if len(img_b64_str) > num_tags_in_prompt:
num_tags_to_add = len(img_b64_str) - num_tags_in_prompt
tags_to_add = image_tag * num_tags_to_add
prompt = f"{tags_to_add}{prompt}"

# Replace image tags with the data
for i in img_b64_str:
formatted_image_str = f"![](data:image/png;base64,{i})\n"
prompt = prompt.replace(image_tag, formatted_image_str, 1)
image_prompt = f"{prompt}\nASSISTANT:"

if streaming:

Expand Down Expand Up @@ -152,4 +181,6 @@ async def stream_generator():
lvm_endpoint = os.getenv("LVM_ENDPOINT", "http://localhost:8399")
lvm_client = AsyncInferenceClient(lvm_endpoint)
logger.info("[LVM] LVM initialized.")
if logflag:
logger.info(f"MAX_IMAGES: {max_images}")
opea_microservices["opea_service@lvm_tgi"].start()
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ async def retrieve(
if isinstance(input, EmbedMultimodalDoc):
metadata_list = []
for r in search_res:
# If the input had an image, pass that through in the metadata along with the search result image
if input.base64_image:
if r.metadata["b64_img_str"]:
r.metadata["b64_img_str"] = [input.base64_image, r.metadata["b64_img_str"]]
else:
r.metadata["b64_img_str"] = input.base64_image
metadata_list.append(r.metadata)
retrieved_docs.append(TextDoc(text=r.page_content))
result = SearchedMultimodalDoc(retrieved_docs=retrieved_docs, initial_query=input.text, metadata=metadata_list)
Expand Down
16 changes: 16 additions & 0 deletions tests/embeddings/test_embeddings_multimodal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,22 @@ function validate_microservice_image_text_pair_embedding() {
fi
}

function validate_microservice_b64_image_text_pair_embedding() {
    # Send a text + base64-encoded-image pair to the multimodal embedding
    # microservice and confirm that the response contains an embedding.
    # On failure, dump the container logs and abort the test run.
    payload='{"text": {"text" : "This is some sample text."}, "image" : {"base64_image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}}'
    result=$(http_proxy="" curl http://${ip_address}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \
        -X POST \
        -H "Content-Type: application/json" \
        -d "$payload")

    case "$result" in
        *embedding*)
            echo "Result correct."
            ;;
        *)
            echo "Result wrong. Received was $result"
            docker logs embedding-multimodal-bridgetower
            docker logs embedding-multimodal
            exit 1
            ;;
    esac
}

function validate_microservice() {
validate_microservice_text_embedding
validate_microservice_image_text_pair_embedding
Expand Down
Loading