diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py index 8c71086f58..56de4a8c60 100644 --- a/comps/cores/proto/docarray.py +++ b/comps/cores/proto/docarray.py @@ -278,7 +278,7 @@ class GraphDoc(BaseDoc): class LVMDoc(BaseDoc): - image: str + image: Union[str, List[str]] prompt: str max_new_tokens: conint(ge=0, le=1024) = 512 top_k: int = 10 diff --git a/comps/embeddings/multimodal/README.md b/comps/embeddings/multimodal/README.md index c75a60f12a..c839365bcd 100644 --- a/comps/embeddings/multimodal/README.md +++ b/comps/embeddings/multimodal/README.md @@ -170,11 +170,18 @@ docker compose -f docker_compose_multimodal_embedding.yaml up -d **Compute a joint embedding of an image-text pair** +The image can be passed as a URL: ```bash curl -X POST http://0.0.0.0:6600/v1/embeddings \ -H "Content-Type: application/json" \ -d '{"text": {"text" : "This is some sample text."}, "image" : {"url": "https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true"}}' ``` +Or as a base64 encoded string: +```bash +curl -X POST http://0.0.0.0:6600/v1/embeddings \ + -H "Content-Type: application/json" \ + -d '{"text": {"text" : "This is some sample text."}, "image" : {"base64_image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}}' +``` **Compute an embedding of a text** diff --git a/comps/embeddings/multimodal/multimodal_langchain/mm_embedding_mmei.py b/comps/embeddings/multimodal/multimodal_langchain/mm_embedding_mmei.py index fbd972a202..cd052fc288 100644 --- a/comps/embeddings/multimodal/multimodal_langchain/mm_embedding_mmei.py +++ b/comps/embeddings/multimodal/multimodal_langchain/mm_embedding_mmei.py @@ -7,6 +7,7 @@ import requests from fastapi.responses import JSONResponse +from typing import Union from comps import ( CustomLogger, @@ -38,7 +39,7 @@ output_datatype=EmbedMultimodalDoc, ) @register_statistics(names=["opea_service@multimodal_embedding_mmei_langchain"]) 
-def embedding(input: MultimodalDoc) -> EmbedDoc: +def embedding(input: MultimodalDoc) -> Union[EmbedDoc, EmbedMultimodalDoc]: start = time.time() if logflag: logger.info(input) @@ -48,9 +49,15 @@ def embedding(input: MultimodalDoc) -> EmbedDoc: json["text"] = input.text elif isinstance(input, TextImageDoc): json["text"] = input.text.text - img_bytes = input.image.url.load_bytes() - base64_img = base64.b64encode(img_bytes).decode("utf-8") - json["img_b64_str"] = base64_img + base64_img = "" + if input.image.url: + img_bytes = input.image.url.load_bytes() + base64_img = base64.b64encode(img_bytes).decode("utf-8") + elif input.image.base64_image: + base64_img = input.image.base64_image + + if base64_img: + json["img_b64_str"] = base64_img else: return JSONResponse(status_code=400, content={"message": "Bad request!"}) @@ -66,6 +73,9 @@ def embedding(input: MultimodalDoc) -> EmbedDoc: res = EmbedDoc(text=input.text, embedding=embed_vector) elif isinstance(input, TextImageDoc): res = EmbedMultimodalDoc(text=input.text.text, url=input.image.url, embedding=embed_vector) + + if base64_img: + res.base64_image = base64_img except requests.exceptions.ConnectionError: res = JSONResponse(status_code=503, content={"message": "Multimodal embedding endpoint not started!"}) statistics_dict["opea_service@multimodal_embedding_mmei_langchain"].append_latency(time.time() - start, None) diff --git a/comps/lvms/llava/README.md b/comps/lvms/llava/README.md index 998eb4b664..74e1de706f 100644 --- a/comps/lvms/llava/README.md +++ b/comps/lvms/llava/README.md @@ -1,6 +1,6 @@ # LVM Microservice -Visual Question and Answering is one of the multimodal tasks empowered by LVMs (Large Visual Models). This microservice supports visual Q&A by using LLaVA as the base large visual model. It accepts two inputs: a prompt and an image. It outputs the answer to the prompt about the image. +Visual Question and Answering is one of the multimodal tasks empowered by LVMs (Large Visual Models). 
This microservice supports visual Q&A by using LLaVA as the base large visual model. It accepts two inputs: a prompt and one or more images. It outputs the answer to the prompt about the images. ## 🚀1. Start Microservice with Python (Option 1) @@ -92,10 +92,15 @@ docker run -p 8399:8399 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_M #### 2.2.2 Start LVM service +> Note: The `MAX_IMAGES` environment variable is used to specify the maximum number of images that will be sent from the LVM service to the LLaVA server. +> If an image list longer than `MAX_IMAGES` is sent to the LVM service, a shortened image list will be sent to the LLaVA server. If the image list +> needs to be shortened, the most recent images (the ones at the end of the list) are kept and sent to the LLaVA server. Some LLaVA models have not +> been trained with multiple images, so sending them more than one image may lead to inaccurate results. If `MAX_IMAGES` is not set, it will default to `1`. + ```bash ip_address=$(hostname -I | awk '{print $1}') -docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LVM_ENDPOINT=http://$ip_address:8399 opea/lvm-llava-svc:latest +docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LVM_ENDPOINT=http://$ip_address:8399 -e MAX_IMAGES=1 opea/lvm-llava-svc:latest ``` #### 2.2.3 Test @@ -106,6 +111,9 @@ docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$htt # curl with an image and a prompt http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?"}' -H 'Content-Type: application/json' +# curl with multiple images and a prompt (Note that depending on your MAX_IMAGES value, not all of the images may be sent to the LLaVA model) +http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": 
["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC"], "prompt":"What is in these images?"}' -H 'Content-Type: application/json' + # curl with a prompt only (no image) http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "", "prompt":"What is deep learning?"}' -H 'Content-Type: application/json' diff --git a/comps/lvms/llava/dependency/llava_server.py b/comps/lvms/llava/dependency/llava_server.py index 644e15a82e..4fc0043805 100644 --- a/comps/lvms/llava/dependency/llava_server.py +++ b/comps/lvms/llava/dependency/llava_server.py @@ -13,6 +13,7 @@ import uvicorn from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response +from transformers import AutoProcessor from transformers import pipeline from transformers.image_utils import load_image @@ -33,9 +34,16 @@ def pipeline_preprocess(self, image, prompt=None, timeout=None): The original transformers image-to-text pipeline preprocess function requires that an image is passed in, and will fail if the image parameter is null/empty. In order to support multimodal use cases with the same pipeline, this preprocess function handles the case where there is no image with the prompt. + Also, the image-to-text pipeline typically treats multiple images passed in as a list as a batch (where it iterates + over the image inputs for generation). For that reason, the original pipeline_preprocess code would only get a + single image at a time. To support multiple images, the pipeline call is updated to send a list of lists for the + images (so that when iterated, we still get multiple images) and this pipeline_preprocess function has been updated + to handle a list of images in addition to single images. 
""" - if image: + if isinstance(image, list): + image = [load_image(i, timeout=timeout) for i in image] + elif image: image = load_image(image, timeout=timeout) if prompt is not None: @@ -114,23 +122,52 @@ async def health() -> Response: @app.post("/generate") -async def generate(request: Request) -> Response: # FIXME batch_size=1 for now, only accept single image +async def generate(request: Request) -> Response: # FIXME batch_size=1 for now print("LLaVA generation begin.") request_dict = await request.json() prompt = request_dict.pop("prompt") - img_b64_str = request_dict.pop("img_b64_str") + img_b64_str = request_dict.pop("img_b64_str") # String or list of strings max_new_tokens = request_dict.pop("max_new_tokens", 100) + # Determine the format of the role labels based on the model name + model_name = generator.model.name_or_path + user_label = "USER:" + assistant_label = "ASSISTANT:" + image_tag = "\n" + + # This is the role label that we see in the results from the pipeline. This is used to split the output. 
+ output_assistant_label = "ASSISTANT: " + + if "llava-interleave" in model_name: + user_label = "<|im_start|>user" + assistant_label = "<|im_end|><|im_start|>assistant" + output_assistant_label = "assistant " + elif "llava-v1.6-mistral" in model_name: + user_label = "[INST]" + assistant_label = " [/INST]" + output_assistant_label = "[/INST] " + if img_b64_str: - # Decode and Resize the image - image = PIL.Image.open(BytesIO(base64.b64decode(img_b64_str))) - image = process_image(image) - # format the prompt with an image - prompt = f"\nUSER: {prompt}\nASSISTANT:" + if isinstance(img_b64_str, str): + img_b64_str = [img_b64_str] + + # Decode and Resize the images + images = [] + for img_b64 in img_b64_str: + if img_b64: + image = PIL.Image.open(BytesIO(base64.b64decode(img_b64))) + image = process_image(image) + images.append(image) + + # If the prompt provided does not have all the image tags, format the prompt with images + num_images = len(images) + num_image_tags = prompt.count(image_tag) + image_tags = image_tag * (num_images - num_image_tags) if num_images > num_image_tags else "" + prompt = f"{user_label}{image_tags} {prompt}{assistant_label}" else: - image = None + images = None # format the prompt with text only - prompt = f"USER: {prompt}\nASSISTANT:" + prompt = f"{user_label} {prompt}\n{assistant_label}" if args.device == "hpu": generate_kwargs = { @@ -149,12 +186,13 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now, # Override the pipeline preprocessing generator.preprocess = pipeline_preprocess.__get__(generator, type(generator)) - result = generator(image, prompt=prompt, batch_size=1, generate_kwargs=generate_kwargs) + result = generator([images], prompt=prompt, batch_size=1, generate_kwargs=generate_kwargs) end = time.time() - result = result[0]["generated_text"].split("ASSISTANT: ")[-1] + result = result[0][0]["generated_text"].split(output_assistant_label.strip())[-1].strip() print(f"LLaVA result = {result}, time = 
{(end-start) * 1000 }ms") - if image: - image.close() + if images: + for i in images: + i.close() ret = {"text": result} return JSONResponse(ret) @@ -191,6 +229,8 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now, device=args.device, ) + processor = AutoProcessor.from_pretrained(model_name_or_path) + # warmup print("LLaVA warmup...") if args.device == "hpu": @@ -214,10 +254,23 @@ async def generate(request: Request) -> Response: # FIXME batch_size=1 for now, images = [] for image_path in image_paths: images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw)) + + # Generate a text prompt to use for warm up + conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What's the content of the image?"}, + ], + }, + ] + text_prompt = processor.apply_chat_template(conversation) + for i in range(args.warmup): generator( images, - prompt="\nUSER: What's the content of the image?\nASSISTANT:", + prompt=text_prompt, batch_size=1, generate_kwargs=generate_kwargs, ) diff --git a/comps/lvms/llava/lvm.py b/comps/lvms/llava/lvm.py index 897f7cbbe4..9d7bde0f90 100644 --- a/comps/lvms/llava/lvm.py +++ b/comps/lvms/llava/lvm.py @@ -28,6 +28,9 @@ logger = CustomLogger("lvm") logflag = os.getenv("LOGFLAG", False) +# The maximum number of images that should be sent to the LVM +max_images = int(os.getenv("MAX_IMAGES", 1)) + @register_microservice( name="opea_service@lvm", @@ -76,6 +79,17 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc prompt = request.prompt max_new_tokens = request.max_new_tokens + # Limit the number of images being sent to the LVM + if isinstance(img_b64_str, list) and len(img_b64_str) > max_images: + img_b64_str=img_b64_str[-max_images:] + + # Adjust the number of images tags in the prompt + image_tag = "\n" + num_tags_in_prompt = prompt.count(image_tag) + + if len(img_b64_str) < num_tags_in_prompt: + prompt = 
prompt.replace(image_tag, "", num_tags_in_prompt - len(img_b64_str)) + inputs = {"img_b64_str": img_b64_str, "prompt": prompt, "max_new_tokens": max_new_tokens} # forward to the LLaVA server response = requests.post(url=f"{lvm_endpoint}/generate", data=json.dumps(inputs), proxies={"http": None}) @@ -99,5 +113,8 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc if __name__ == "__main__": lvm_endpoint = os.getenv("LVM_ENDPOINT", "http://localhost:8399") + if logflag: + logger.info(f"MAX_IMAGES: {max_images}") + logger.info("[LVM] LVM initialized.") opea_microservices["opea_service@lvm"].start() diff --git a/comps/lvms/tgi-llava/lvm_tgi.py b/comps/lvms/tgi-llava/lvm_tgi.py index 38b492c395..04ceee400c 100644 --- a/comps/lvms/tgi-llava/lvm_tgi.py +++ b/comps/lvms/tgi-llava/lvm_tgi.py @@ -27,6 +27,9 @@ logger = CustomLogger("lvm_tgi") logflag = os.getenv("LOGFLAG", False) +# The maximum number of images that should be sent to the LVM +max_images = int(os.getenv("MAX_IMAGES", 1)) + @register_microservice( name="opea_service@lvm_tgi", @@ -88,15 +91,41 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc top_k = request.top_k top_p = request.top_p - if not img_b64_str: - # Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image. - # Provide an image and then instruct the model to ignore the image. 
The base64 string below is the encoded png: - # https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png - img_b64_str = "iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC" - prompt = f"Please disregard the image and answer the question. {prompt}" + # Make img_b64_str into a list of strings (if it's not already a list) + if not isinstance(img_b64_str, list): + if img_b64_str: + img_b64_str = [img_b64_str] + else: + # If img_b64_str was an empty string, which means we have just have a text prompt. + # Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image. + # Provide an image and then instruct the model to ignore the image. 
The base64 string below is the encoded png: + # https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png + img_b64_str = ["iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC"] + prompt = f"Please disregard the image and answer the question. 
{prompt}" + + # Truncate the list of images if we have too many, only sending the most recent ones at the end of the list + if len(img_b64_str) > max_images: + img_b64_str=img_b64_str[-max_images:] - image = f"data:image/png;base64,{img_b64_str}" - image_prompt = f"![]({image})\n{prompt}\nASSISTANT:" + # Check the number of image tags in the prompt and adjust them to match the number of images that we have + image_tag = "\n" + num_tags_in_prompt = prompt.count(image_tag) + + # We have too many image tags in the prompt replace the first x instance of the tag with an empty string + if len(img_b64_str) < num_tags_in_prompt: + prompt = prompt.replace(image_tag, "", num_tags_in_prompt - len(img_b64_str)) + + # We don't have enough image tags in the prompt, add them + if len(img_b64_str) > num_tags_in_prompt: + num_tags_to_add = len(img_b64_str) - num_tags_in_prompt + tags_to_add = image_tag * num_tags_to_add + prompt = f"{tags_to_add}{prompt}" + + # Replace image tags with the data + for i in img_b64_str: + formatted_image_str = f"![](data:image/png;base64,{i})\n" + prompt = prompt.replace(image_tag, formatted_image_str, 1) + image_prompt = f"{prompt}\nASSISTANT:" if streaming: @@ -152,4 +181,6 @@ async def stream_generator(): lvm_endpoint = os.getenv("LVM_ENDPOINT", "http://localhost:8399") lvm_client = AsyncInferenceClient(lvm_endpoint) logger.info("[LVM] LVM initialized.") + if logflag: + logger.info(f"MAX_IMAGES: {max_images}") opea_microservices["opea_service@lvm_tgi"].start() diff --git a/comps/retrievers/multimodal/redis/langchain/retriever_redis.py b/comps/retrievers/multimodal/redis/langchain/retriever_redis.py index a01b3e20c4..a92d59aba2 100644 --- a/comps/retrievers/multimodal/redis/langchain/retriever_redis.py +++ b/comps/retrievers/multimodal/redis/langchain/retriever_redis.py @@ -69,6 +69,12 @@ async def retrieve( if isinstance(input, EmbedMultimodalDoc): metadata_list = [] for r in search_res: + # If the input had an image, pass that through in the 
metadata along with the search result image + if input.base64_image: + if r.metadata["b64_img_str"]: + r.metadata["b64_img_str"] = [input.base64_image, r.metadata["b64_img_str"]] + else: + r.metadata["b64_img_str"] = input.base64_image metadata_list.append(r.metadata) retrieved_docs.append(TextDoc(text=r.page_content)) result = SearchedMultimodalDoc(retrieved_docs=retrieved_docs, initial_query=input.text, metadata=metadata_list) diff --git a/tests/embeddings/test_embeddings_multimodal.sh b/tests/embeddings/test_embeddings_multimodal.sh index bd2ca93b70..5bb2fd9f93 100644 --- a/tests/embeddings/test_embeddings_multimodal.sh +++ b/tests/embeddings/test_embeddings_multimodal.sh @@ -85,6 +85,22 @@ function validate_microservice_image_text_pair_embedding() { fi } +function validate_microservice_b64_image_text_pair_embedding() { + result=$(http_proxy="" curl http://${ip_address}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"text": {"text" : "This is some sample text."}, "image" : {"base64_image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"}}') + + if [[ $result == *"embedding"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs embedding-multimodal-bridgetower + docker logs embedding-multimodal + exit 1 + fi +} + function validate_microservice() { validate_microservice_text_embedding validate_microservice_image_text_pair_embedding diff --git a/tests/lvms/test_lvms_llava.sh b/tests/lvms/test_lvms_llava.sh index 4627ec6ee7..8558fa5e3d 100644 --- a/tests/lvms/test_lvms_llava.sh +++ b/tests/lvms/test_lvms_llava.sh @@ -48,6 +48,42 @@ function validate_microservice() { exit 1 fi + # Test sending two images with a text prompt with one image tag in the prompt. + # The first image is green and the second image is blue. Since the default MAX_IMAGES is 1, only the blue image should be sent to the LVM. 
+ result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC"], "prompt":"\nWhat are in these images?"}' -H 'Content-Type: application/json') + if [[ $result == *"blue"* ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-lvm-llava >> ${LOG_PATH}/llava-dependency.log + docker logs test-comps-lvm-llava-svc >> ${LOG_PATH}/llava-server.log + exit 1 + fi + + # Test sending two images with a text prompt without any image tags. + # The first image is blue and the second image is green. Since the default MAX_IMAGES is 1, only the green image should be sent to the LVM. + result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC"], "prompt":"What are in these images?"}' -H 'Content-Type: application/json') + if [[ $result == *"green"* ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-lvm-llava >> ${LOG_PATH}/llava-dependency.log + docker logs test-comps-lvm-llava-svc >> ${LOG_PATH}/llava-server.log + exit 1 + fi + + # Same test as above, except including two image tags with the prompt to ensure the number of image tags is reconciled. + # The first image is blue and the second image is green. Since the default MAX_IMAGES is 1, only the green image should be sent to the LVM. 
+ result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC"], "prompt":"\n\nWhat are in these images?"}' -H 'Content-Type: application/json') + if [[ $result == *"green"* ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-lvm-llava >> ${LOG_PATH}/llava-dependency.log + docker logs test-comps-lvm-llava-svc >> ${LOG_PATH}/llava-server.log + exit 1 + fi + result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"retrieved_docs": [], "initial_query": "What is this?", "top_n": 1, "metadata": [{"b64_img_str": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "transcript_for_inference": "yellow image", "video_id": "8c7461df-b373-4a00-8696-9a2234359fe0", "time_of_frame_ms":"37000000", "source_video":"WeAreGoingOnBullrun_8c7461df-b373-4a00-8696-9a2234359fe0.mp4"}]}' -H 'Content-Type: application/json') if [[ $result == *"yellow"* ]]; then echo "Result correct." diff --git a/tests/lvms/test_lvms_tgi-llava_on_intel_hpu.sh b/tests/lvms/test_lvms_tgi-llava_on_intel_hpu.sh index 1fa0155266..9d1a69a7ae 100644 --- a/tests/lvms/test_lvms_tgi-llava_on_intel_hpu.sh +++ b/tests/lvms/test_lvms_tgi-llava_on_intel_hpu.sh @@ -54,6 +54,41 @@ function validate_microservice() { echo "LVM prompt without image - HTTP status (successful)" fi + # Test sending two images with a text prompt with one image tag in the prompt. + # The first image is green and the second image is blue. Since the default MAX_IMAGES is 1, only the blue image should be sent to the LVM. 
+ result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC"], "prompt":"\nWhat are in these images?"}' -H 'Content-Type: application/json') + if [[ $result == *"blue"* ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-lvm-llava >> ${LOG_PATH}/llava-dependency.log + docker logs test-comps-lvm-llava-svc >> ${LOG_PATH}/llava-server.log + exit 1 + fi + + # Test sending two images with a text prompt without any image tags. + # The first image is blue and the second image is green. Since the default MAX_IMAGES is 1, only the green image should be sent to the LVM. + result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC"], "prompt":"What are in these images?"}' -H 'Content-Type: application/json') + if [[ $result == *"green"* ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-lvm-llava >> ${LOG_PATH}/llava-dependency.log + docker logs test-comps-lvm-llava-svc >> ${LOG_PATH}/llava-server.log + exit 1 + fi + + # Same test as above, except including two image tags with the prompt to ensure the number of image tags is reconciled. + # The first image is blue and the second image is green. Since the default MAX_IMAGES is 1, only the green image should be sent to the LVM. 
+ result=$(http_proxy="" curl http://localhost:$lvm_port/v1/lvm -XPOST -d '{"image": ["iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNkYPhfz0AEYBxVSF+FAP5FDvcfRYWgAAAAAElFTkSuQmCC", "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mNk+M9Qz0AEYBxVSF+FAAhKDveksOjmAAAAAElFTkSuQmCC"], "prompt":"\n\nWhat are in these images?"}' -H 'Content-Type: application/json') + if [[ $result == *"green"* ]]; then + echo "Result correct." + else + echo "Result wrong." + docker logs test-comps-lvm-llava >> ${LOG_PATH}/llava-dependency.log + docker logs test-comps-lvm-llava-svc >> ${LOG_PATH}/llava-server.log + exit 1 + fi } function stop_docker() { diff --git a/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh b/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh index 873516ddc5..06fecec69d 100644 --- a/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh +++ b/tests/retrievers/test_retrievers_multimodal_redis_langchain.sh @@ -58,6 +58,32 @@ function validate_microservice() { docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log exit 1 fi + + # Test the retriever with a b64 image that should be passed through + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding},\"img_b64_str\":\"iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC\"}" -H 'Content-Type: application/json' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ retriever ] HTTP status is 200. Checking content..." + local CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/retriever.log) + + if echo "$CONTENT" | grep -q "retrieved_docs"; then + echo "[ retriever ] Content has retrieved_docs as expected." + if echo "$CONTENT" | grep -q "b64_img_str"; then + echo "[ retriever ] Content has b64_img_str as expected." 
+ else + echo "[ retriever ] Content does not include the b64_img_str: $CONTENT" + docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log + exit 1 + fi + else + echo "[ retriever ] Content does not match the expected result: $CONTENT" + docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log + exit 1 + fi + else + echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-retriever-multimodal-redis >> ${LOG_PATH}/retriever.log + exit 1 + fi } function stop_docker() {