Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
51651aa
Backend enhancements for image query capabilities for MultimodalQnA
dmsuehir Nov 22, 2024
f83e2e1
Fix model name var
dmsuehir Nov 22, 2024
1a61cb5
Merge branch 'mmqna-phase2' of github.com:mhbuehler/GenAIComps into d…
dmsuehir Nov 25, 2024
1f0dfcd
Remove space at end of prompt
dmsuehir Nov 26, 2024
107680d
Merge branch 'mmqna-phase2' of github.com:mhbuehler/GenAIComps into d…
dmsuehir Dec 2, 2024
5b51771
Add env var for the max number of images sent to the LVM
dmsuehir Dec 2, 2024
242ee6f
README update for the MAX_IMAGES env var
dmsuehir Dec 2, 2024
8b21819
Merge branch 'dina/image_query' of github.com:mhbuehler/GenAIComps in…
dmsuehir Dec 2, 2024
5b41724
Remove prints
dmsuehir Dec 2, 2024
ae5437a
Audio query functionality to multimodal backend (#8)
okhleif-10 Dec 2, 2024
f4a7199
Merge branch 'mmqna-phase2' of github.com:mhbuehler/GenAIComps into d…
dmsuehir Dec 3, 2024
e1e5fde
Merge branch 'main' into mmqna-audio-query
mhbuehler Dec 4, 2024
70c54e1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 4, 2024
6a71843
fixed role bug where i never was > 0
okhleif-10 Dec 4, 2024
411bfdf
Fix after merge
dmsuehir Dec 4, 2024
615459b
removed whitespace
okhleif-10 Dec 4, 2024
1753473
Merge pull request #13 from mhbuehler/omar/role-debug
mhbuehler Dec 4, 2024
dcafe8d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 4, 2024
e32bef4
Fix call to get role labels
dmsuehir Dec 4, 2024
63c08fe
Merge branch 'mmqna-audio-query' of github.com:mhbuehler/GenAIComps i…
dmsuehir Dec 4, 2024
db22c47
Gateway test updates images within the conversation
dmsuehir Dec 5, 2024
fa47959
Adds unit test coverage for audio query
mhbuehler Dec 5, 2024
02efc8a
Update test to check the returned b64 types
dmsuehir Dec 5, 2024
d74bb32
Update test since we don't expect images from the assistant
dmsuehir Dec 5, 2024
37826be
Port number fix
mhbuehler Dec 6, 2024
40d34db
Formatting
mhbuehler Dec 6, 2024
6f2a753
Merge pull request #14 from mhbuehler/melanie/add_test_coverage
mhbuehler Dec 6, 2024
a665c3c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 6, 2024
4a5c8ea
Merge branch 'main' into mmqna-audio-query
ashahba Dec 6, 2024
d9ab567
Fixed place where port number is set
mhbuehler Dec 6, 2024
75b135f
Merge pull request #15 from mhbuehler/melanie/port_placement
mhbuehler Dec 6, 2024
9a077c5
Remove old comment and added more accurate description
dmsuehir Dec 9, 2024
b21e575
add comment in code about MAX_IMAGES
dmsuehir Dec 9, 2024
a3abd8a
Add Gaudi support for image query
dmsuehir Dec 10, 2024
b8dbabf
Merge branch 'mmqna-audio-query' of github.com:mhbuehler/GenAIComps i…
dmsuehir Dec 10, 2024
c87504c
Merge branch 'mmqna-image-query' of github.com:mhbuehler/GenAIComps i…
dmsuehir Dec 12, 2024
723f0c3
Fix to pass the retrieved image last
dmsuehir Dec 12, 2024
b1205f4
Revert out gateway and gateway test code, due to its move to GenAIExa…
dmsuehir Dec 12, 2024
bac117a
Fix retriever test for checking for b64_img_str in the result
dmsuehir Dec 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion comps/cores/proto/docarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ class GraphDoc(BaseDoc):


class LVMDoc(BaseDoc):
image: str
image: Union[str, List[str]]
prompt: str
max_new_tokens: conint(ge=0, le=1024) = 512
top_k: int = 10
Expand Down
7 changes: 7 additions & 0 deletions comps/embeddings/multimodal/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,18 @@ docker compose -f docker_compose_multimodal_embedding.yaml up -d

**Compute a joint embedding of an image-text pair**

The image can be passed as a URL:
```bash
curl -X POST http://0.0.0.0:6600/v1/embeddings \
-H "Content-Type: application/json" \
-d '{"text": {"text" : "This is some sample text."}, "image" : {"url": "https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true"}}'
```
Or as a base64 encoded string:
```bash
curl -X POST http://0.0.0.0:6600/v1/embeddings \
-H "Content-Type: application/json" \
-d '{"text": {"text" : "This is some sample text."}, "image" : {"base64_image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}}'
```

**Compute an embedding of a text**

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import requests
from fastapi.responses import JSONResponse
from typing import Union

from comps import (
CustomLogger,
Expand Down Expand Up @@ -38,7 +39,7 @@
output_datatype=EmbedMultimodalDoc,
)
@register_statistics(names=["opea_service@multimodal_embedding_mmei_langchain"])
def embedding(input: MultimodalDoc) -> EmbedDoc:
def embedding(input: MultimodalDoc) -> Union[EmbedDoc, EmbedMultimodalDoc]:
start = time.time()
if logflag:
logger.info(input)
Expand All @@ -48,9 +49,15 @@ def embedding(input: MultimodalDoc) -> EmbedDoc:
json["text"] = input.text
elif isinstance(input, TextImageDoc):
json["text"] = input.text.text
img_bytes = input.image.url.load_bytes()
base64_img = base64.b64encode(img_bytes).decode("utf-8")
json["img_b64_str"] = base64_img
base64_img = ""
if input.image.url:
img_bytes = input.image.url.load_bytes()
base64_img = base64.b64encode(img_bytes).decode("utf-8")
elif input.image.base64_image:
base64_img = input.image.base64_image

if base64_img:
json["img_b64_str"] = base64_img
else:
return JSONResponse(status_code=400, content={"message": "Bad request!"})

Expand All @@ -66,6 +73,9 @@ def embedding(input: MultimodalDoc) -> EmbedDoc:
res = EmbedDoc(text=input.text, embedding=embed_vector)
elif isinstance(input, TextImageDoc):
res = EmbedMultimodalDoc(text=input.text.text, url=input.image.url, embedding=embed_vector)

if base64_img:
res.base64_image = base64_img
except requests.exceptions.ConnectionError:
res = JSONResponse(status_code=503, content={"message": "Multimodal embedding endpoint not started!"})
statistics_dict["opea_service@multimodal_embedding_mmei_langchain"].append_latency(time.time() - start, None)
Expand Down
12 changes: 10 additions & 2 deletions comps/lvms/llava/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# LVM Microservice

Visual Question and Answering is one of the multimodal tasks empowered by LVMs (Large Visual Models). This microservice supports visual Q&A by using LLaVA as the base large visual model. It accepts two inputs: a prompt and an image. It outputs the answer to the prompt about the image.
Visual Question Answering is one of the multimodal tasks empowered by LVMs (Large Visual Models). This microservice supports visual Q&A by using LLaVA as the base large visual model. It accepts two inputs: a prompt and images. It outputs the answer to the prompt about the images.

## 🚀1. Start Microservice with Python (Option 1)

Expand Down Expand Up @@ -92,10 +92,15 @@ docker run -p 8399:8399 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_M

#### 2.2.2 Start LVM service

> Note: The `MAX_IMAGES` environment variable is used to specify the maximum number of images that will be sent from the LVM service to the LLaVA server.
> If an image list longer than `MAX_IMAGES` is sent to the LVM server, a shortened image list will be sent to the LLaVA service. If the image list
> needs to be shortened, the most recent images (the ones at the end of the list) are prioritized to send to the LLaVA service. Some LLaVA models have not
> been trained with multiple images and may lead to inaccurate results. If `MAX_IMAGES` is not set, it will default to `1`.

```bash
ip_address=$(hostname -I | awk '{print $1}')

docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LVM_ENDPOINT=http://$ip_address:8399 opea/lvm-llava-svc:latest
docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LVM_ENDPOINT=http://$ip_address:8399 -e MAX_IMAGES=1 opea/lvm-llava-svc:latest
```

#### 2.2.3 Test
Expand All @@ -106,6 +111,9 @@ docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$htt
# curl with an image and a prompt
http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?"}' -H 'Content-Type: application/json'

# curl with multiple images and a prompt (Note that depending on your MAX_IMAGES value, both images may not be sent to the LLaVA model)
http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC"], "prompt":"What is in these images?"}' -H 'Content-Type: application/json'

# curl with a prompt only (no image)
http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "", "prompt":"What is deep learning?"}' -H 'Content-Type: application/json'

Expand Down
83 changes: 68 additions & 15 deletions comps/lvms/llava/dependency/llava_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response
from transformers import AutoProcessor
from transformers import pipeline
from transformers.image_utils import load_image

Expand All @@ -33,9 +34,16 @@ def pipeline_preprocess(self, image, prompt=None, timeout=None):
The original transformers image-to-text pipeline preprocess function requires that an image is passed in, and will
fail if the image parameter is null/empty. In order to support multimodal use cases with the same pipeline, this
preprocess function handles the case where there is no image with the prompt.
Also, the image-to-text pipeline typically treats multiple images passed in as a list as a batch (where it iterates
over the image inputs for generation). For that reason, the original pipeline_preprocess code would only get a
single image at a time. To support multiple images, the pipeline call is updated to send a list of lists for the
images (so that when iterated, we still get multiple images) and this pipeline_preprocess function has been updated
to handle a list of images in addition to single images.
"""

if image:
if isinstance(image, list):
image = [load_image(i, timeout=timeout) for i in image]
elif image:
image = load_image(image, timeout=timeout)

if prompt is not None:
Expand Down Expand Up @@ -114,23 +122,52 @@ async def health() -> Response:


@app.post("/generate")
async def generate(request: Request) -> Response: # FIXME batch_size=1 for now, only accept single image
async def generate(request: Request) -> Response: # FIXME batch_size=1 for now
print("LLaVA generation begin.")
request_dict = await request.json()
prompt = request_dict.pop("prompt")
img_b64_str = request_dict.pop("img_b64_str")
img_b64_str = request_dict.pop("img_b64_str") # String or list of strings
max_new_tokens = request_dict.pop("max_new_tokens", 100)

# Determine the format of the role labels based on the model name
model_name = generator.model.name_or_path
user_label = "USER:"
assistant_label = "ASSISTANT:"
image_tag = "<image>\n"

# This is the role label that we see in the results from the pipeline. This is used to split the output.
output_assistant_label = "ASSISTANT: "

if "llava-interleave" in model_name:
user_label = "<|im_start|>user"
assistant_label = "<|im_end|><|im_start|>assistant"
output_assistant_label = "assistant "
elif "llava-v1.6-mistral" in model_name:
user_label = "[INST]"
assistant_label = " [/INST]"
output_assistant_label = "[/INST] "

if img_b64_str:
# Decode and Resize the image
image = PIL.Image.open(BytesIO(base64.b64decode(img_b64_str)))
image = process_image(image)
# format the prompt with an image
prompt = f"<image>\nUSER: {prompt}\nASSISTANT:"
if isinstance(img_b64_str, str):
img_b64_str = [img_b64_str]

# Decode and Resize the images
images = []
for img_b64 in img_b64_str:
if img_b64:
image = PIL.Image.open(BytesIO(base64.b64decode(img_b64)))
image = process_image(image)
images.append(image)

# If the prompt provided does not have all the image tags, format the prompt with images
num_images = len(images)
num_image_tags = prompt.count(image_tag)
image_tags = image_tag * (num_images - num_image_tags) if num_images > num_image_tags else ""
prompt = f"{user_label}{image_tags} {prompt}{assistant_label}"
else:
image = None
images = None
# format the prompt with text only
prompt = f"USER: {prompt}\nASSISTANT:"
prompt = f"{user_label} {prompt}\n{assistant_label}"

if args.device == "hpu":
generate_kwargs = {
Expand All @@ -149,12 +186,13 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now,
# Override the pipeline preprocessing
generator.preprocess = pipeline_preprocess.__get__(generator, type(generator))

result = generator(image, prompt=prompt, batch_size=1, generate_kwargs=generate_kwargs)
result = generator([images], prompt=prompt, batch_size=1, generate_kwargs=generate_kwargs)
end = time.time()
result = result[0]["generated_text"].split("ASSISTANT: ")[-1]
result = result[0][0]["generated_text"].split(output_assistant_label.strip())[-1].strip()
print(f"LLaVA result = {result}, time = {(end-start) * 1000 }ms")
if image:
image.close()
if images:
for i in images:
i.close()

ret = {"text": result}
return JSONResponse(ret)
Expand Down Expand Up @@ -191,6 +229,8 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now,
device=args.device,
)

processor = AutoProcessor.from_pretrained(model_name_or_path)

# warmup
print("LLaVA warmup...")
if args.device == "hpu":
Expand All @@ -214,10 +254,23 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now,
images = []
for image_path in image_paths:
images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw))

# Generate a text prompt to use for warm up
conversation = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What's the content of the image?"},
],
},
]
text_prompt = processor.apply_chat_template(conversation)

for i in range(args.warmup):
generator(
images,
prompt="<image>\nUSER: What's the content of the image?\nASSISTANT:",
prompt=text_prompt,
batch_size=1,
generate_kwargs=generate_kwargs,
)
Expand Down
17 changes: 17 additions & 0 deletions comps/lvms/llava/lvm.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
logger = CustomLogger("lvm")
logflag = os.getenv("LOGFLAG", False)

# The maximum number of images that should be sent to the LVM
max_images = int(os.getenv("MAX_IMAGES", 1))
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this line, is 1 being set manually? Or is it a default?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This means that if MAX_IMAGES is unset, it will default to 1



@register_microservice(
name="opea_service@lvm",
Expand Down Expand Up @@ -76,6 +79,17 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
prompt = request.prompt
max_new_tokens = request.max_new_tokens

# Limit the number of images being sent to the LVM
if isinstance(img_b64_str, list) and len(img_b64_str) > max_images:
img_b64_str=img_b64_str[-max_images:]

# Adjust the number of images tags in the prompt
image_tag = "<image>\n"
num_tags_in_prompt = prompt.count(image_tag)

if len(img_b64_str) < num_tags_in_prompt:
prompt = prompt.replace(image_tag, "", num_tags_in_prompt - len(img_b64_str))

inputs = {"img_b64_str": img_b64_str, "prompt": prompt, "max_new_tokens": max_new_tokens}
# forward to the LLaVA server
response = requests.post(url=f"{lvm_endpoint}/generate", data=json.dumps(inputs), proxies={"http": None})
Expand All @@ -99,5 +113,8 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
if __name__ == "__main__":
lvm_endpoint = os.getenv("LVM_ENDPOINT", "http://localhost:8399")

if logflag:
logger.info(f"MAX_IMAGES: {max_images}")

logger.info("[LVM] LVM initialized.")
opea_microservices["opea_service@lvm"].start()
47 changes: 39 additions & 8 deletions comps/lvms/tgi-llava/lvm_tgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
logger = CustomLogger("lvm_tgi")
logflag = os.getenv("LOGFLAG", False)

# The maximum number of images that should be sent to the LVM
max_images = int(os.getenv("MAX_IMAGES", 1))


@register_microservice(
name="opea_service@lvm_tgi",
Expand Down Expand Up @@ -88,15 +91,41 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
top_k = request.top_k
top_p = request.top_p

if not img_b64_str:
# Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image.
# Provide an image and then instruct the model to ignore the image. The base64 string below is the encoded png:
# https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png
img_b64_str = "iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC"
prompt = f"Please disregard the image and answer the question. {prompt}"
# Make img_b64_str into a list of strings (if it's not already a list)
if not isinstance(img_b64_str, list):
if img_b64_str:
img_b64_str = [img_b64_str]
else:
# If img_b64_str was an empty string, that means we just have a text prompt.
# Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image.
# Provide an image and then instruct the model to ignore the image. The base64 string below is the encoded png:
# https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png
img_b64_str = ["iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC"]
prompt = f"Please disregard the image and answer the question. {prompt}"

# Truncate the list of images if we have too many, only sending the most recent ones at the end of the list
if len(img_b64_str) > max_images:
img_b64_str=img_b64_str[-max_images:]

image = f"data:image/png;base64,{img_b64_str}"
image_prompt = f"![]({image})\n{prompt}\nASSISTANT:"
# Check the number of image tags in the prompt and adjust them to match the number of images that we have
image_tag = "<image>\n"
num_tags_in_prompt = prompt.count(image_tag)

# If we have too many image tags in the prompt, replace the first instances of the tag with an empty string
if len(img_b64_str) < num_tags_in_prompt:
prompt = prompt.replace(image_tag, "", num_tags_in_prompt - len(img_b64_str))

# We don't have enough image tags in the prompt, add them
if len(img_b64_str) > num_tags_in_prompt:
num_tags_to_add = len(img_b64_str) - num_tags_in_prompt
tags_to_add = image_tag * num_tags_to_add
prompt = f"{tags_to_add}{prompt}"

# Replace image tags with the data
for i in img_b64_str:
formatted_image_str = f"![](data:image/png;base64,{i})\n"
prompt = prompt.replace(image_tag, formatted_image_str, 1)
image_prompt = f"{prompt}\nASSISTANT:"

if streaming:

Expand Down Expand Up @@ -152,4 +181,6 @@ async def stream_generator():
lvm_endpoint = os.getenv("LVM_ENDPOINT", "http://localhost:8399")
lvm_client = AsyncInferenceClient(lvm_endpoint)
logger.info("[LVM] LVM initialized.")
if logflag:
logger.info(f"MAX_IMAGES: {max_images}")
opea_microservices["opea_service@lvm_tgi"].start()
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ async def retrieve(
if isinstance(input, EmbedMultimodalDoc):
metadata_list = []
for r in search_res:
# If the input had an image, pass that through in the metadata along with the search result image
if input.base64_image:
if r.metadata["b64_img_str"]:
r.metadata["b64_img_str"] = [input.base64_image, r.metadata["b64_img_str"]]
else:
r.metadata["b64_img_str"] = input.base64_image
metadata_list.append(r.metadata)
retrieved_docs.append(TextDoc(text=r.page_content))
result = SearchedMultimodalDoc(retrieved_docs=retrieved_docs, initial_query=input.text, metadata=metadata_list)
Expand Down
16 changes: 16 additions & 0 deletions tests/embeddings/test_embeddings_multimodal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,22 @@ function validate_microservice_image_text_pair_embedding() {
fi
}

function validate_microservice_b64_image_text_pair_embedding() {
    # Send a text + base64-encoded-image pair to the multimodal embedding
    # microservice and confirm that the response contains an embedding.
    # On failure, dump the container logs and abort the test run.
    payload='{"text": {"text" : "This is some sample text."}, "image" : {"base64_image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}}'
    result=$(http_proxy="" curl http://${ip_address}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \
        -X POST \
        -H "Content-Type: application/json" \
        -d "$payload")

    case "$result" in
        *embedding*)
            echo "Result correct."
            ;;
        *)
            echo "Result wrong. Received was $result"
            docker logs embedding-multimodal-bridgetower
            docker logs embedding-multimodal
            exit 1
            ;;
    esac
}

function validate_microservice() {
validate_microservice_text_embedding
validate_microservice_image_text_pair_embedding
Expand Down
Loading