From bb7108d8f7b1abdade516ff0a62c6fb114fd53bb Mon Sep 17 00:00:00 2001
From: Melanie Buehler <melanie.h.buehler@intel.com>
Date: Wed, 27 Nov 2024 14:54:45 -0800
Subject: [PATCH 1/5] Initial implementation of PDF ingestion

Signed-off-by: Melanie Buehler <melanie.h.buehler@intel.com>
---
 .../redis/langchain/prepare_videodoc_redis.py | 87 ++++++++++++++-----
 .../redis/langchain/requirements.txt          |  1 +
 2 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py
index fa8ed4896e..780fbd10b1 100644
--- a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py
+++ b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py
@@ -17,6 +17,7 @@
 from langchain_core.utils import get_from_dict_or_env
 from multimodal_utils import (
     clear_upload_folder,
+    convert_img_to_base64,
     convert_video_to_audio,
     create_upload_folder,
     delete_audio_file,
@@ -30,6 +31,7 @@
     write_vtt,
 )
 from PIL import Image
+import pymupdf
 
 from comps import opea_microservices, register_microservice
 from comps.embeddings.multimodal.bridgetower.bridgetower_embedding import BridgeTowerEmbedding
@@ -510,7 +512,7 @@ async def ingest_generate_caption(files: List[UploadFile] = File(None)):
 )
 async def ingest_with_text(files: List[UploadFile] = File(None)):
     if files:
-        accepted_media_formats = [".mp4", ".png", ".jpg", ".jpeg", ".gif"]
+        accepted_media_formats = [".mp4", ".png", ".jpg", ".jpeg", ".gif", ".pdf"]
         # Create a lookup dictionary containing all media files
         matched_files = {f.filename: [f] for f in files if os.path.splitext(f.filename)[1] in accepted_media_formats}
         uploaded_files_map = {}
@@ -537,25 +539,25 @@ async def ingest_with_text(files: List[UploadFile] = File(None)):
             elif file_extension not in accepted_media_formats:
                 print(f"Skipping file {file.filename} because of unsupported format.")
 
-        # Check if every media file has a caption file
-        for media_file_name, file_pair in matched_files.items():
-            if len(file_pair) != 2:
+        # Check that every media file that is not a pdf has a caption file
+        for media_file_name, file_list in matched_files.items():
+            if len(file_list) != 2 and os.path.splitext(media_file_name)[1] != ".pdf":
                 raise HTTPException(status_code=400, detail=f"No caption file found for {media_file_name}")
 
         if len(matched_files.keys()) == 0:
             return HTTPException(
                 status_code=400,
-                detail="The uploaded files have unsupported formats. Please upload at least one video file (.mp4) with captions (.vtt) or one image (.png, .jpg, .jpeg, or .gif) with caption (.txt)",
+                detail="The uploaded files have unsupported formats. Please upload at least one video file (.mp4) with captions (.vtt) or one image (.png, .jpg, .jpeg, or .gif) with caption (.txt) or one .pdf file",
             )
 
         for media_file in matched_files:
             print(f"Processing file {media_file}")
+            file_name, file_extension = os.path.splitext(media_file)
 
             # Assign unique identifier to file
             file_id = generate_id()
 
             # Create file name by appending identifier
-            file_name, file_extension = os.path.splitext(media_file)
             media_file_name = f"{file_name}_{file_id}{file_extension}"
             media_dir_name = os.path.splitext(media_file_name)[0]
 
@@ -564,28 +566,65 @@ async def ingest_with_text(files: List[UploadFile] = File(None)):
                 shutil.copyfileobj(matched_files[media_file][0].file, f)
             uploaded_files_map[file_name] = media_file_name
 
-            # Save caption file in upload directory
-            caption_file_extension = os.path.splitext(matched_files[media_file][1].filename)[1]
-            caption_file = f"{media_dir_name}{caption_file_extension}"
-            with open(os.path.join(upload_folder, caption_file), "wb") as f:
-                shutil.copyfileobj(matched_files[media_file][1].file, f)
+            if file_extension == ".pdf":
+                import cv2
+                # Set up location to store frames and annotations
+                output_dir = os.path.join(upload_folder, media_dir_name)
+                os.makedirs(output_dir, exist_ok=True)
+                os.makedirs(os.path.join(output_dir, "frames"), exist_ok=True)
+                doc = pymupdf.open(os.path.join(upload_folder, media_file_name))
+                annotations = []
+                for page in doc:
+                    text = page.get_text()
+                    blocks = page.get_text("dict")["blocks"]
+                    imgblocks = [b for b in blocks if b["type"] == 1]
+                    for idx, image in enumerate(imgblocks):
+                        # Write image and caption file for each image found in pdf
+                        img_fname = f"frame_{idx}"
+                        img_fpath = os.path.join(output_dir, "frames", img_fname + ".png")
+                        cv2.imwrite(img_fpath, imgblocks[0]['image'])
+
+                        # Convert image to base64 encoded string
+                        b64_img_str = convert_img_to_base64(frame)
+
+                        # Create annotations for frame from transcripts
+                        annotations.append(
+                            {
+                                "video_id": file_id,
+                                "video_name": os.path.basename(os.path.join(upload_folder, media_file_name)),
+                                "b64_img_str": b64_img_str,
+                                "caption": text,
+                                "time": 0.0,
+                                "frame_no": 0,
+                                "sub_video_id": idx,
+                            }
+                        )
+
+                with open(os.path.join(output_dir, "annotations.json"), "w") as f:
+                    json.dump(annotations, f)
+            else:
+                # Save caption file in upload directory
+                caption_file_extension = os.path.splitext(matched_files[media_file][1].filename)[1]
+                caption_file = f"{media_dir_name}{caption_file_extension}"
+                with open(os.path.join(upload_folder, caption_file), "wb") as f:
+                    shutil.copyfileobj(matched_files[media_file][1].file, f)
 
-            # Store frames and caption annotations in a new directory
-            extract_frames_and_annotations_from_transcripts(
-                file_id,
-                os.path.join(upload_folder, media_file_name),
-                os.path.join(upload_folder, caption_file),
-                os.path.join(upload_folder, media_dir_name),
-            )
+                # Store frames and caption annotations in a new directory
+                extract_frames_and_annotations_from_transcripts(
+                    file_id,
+                    os.path.join(upload_folder, media_file_name),
+                    os.path.join(upload_folder, caption_file),
+                    os.path.join(upload_folder, media_dir_name),
+                )
 
-            # Delete temporary caption file
-            os.remove(os.path.join(upload_folder, caption_file))
+                # Delete temporary caption file
+                os.remove(os.path.join(upload_folder, caption_file))
 
-            # Ingest multimodal data into redis
-            ingest_multimodal(file_name, os.path.join(upload_folder, media_dir_name), embeddings)
+                # Ingest multimodal data into redis
+                ingest_multimodal(file_name, os.path.join(upload_folder, media_dir_name), embeddings)
 
-            # Delete temporary media directory containing frames and annotations
-            shutil.rmtree(os.path.join(upload_folder, media_dir_name))
+                # Delete temporary media directory containing frames and annotations
+                shutil.rmtree(os.path.join(upload_folder, media_dir_name))
 
             print(f"Processed file {media_file}")
 
diff --git a/comps/dataprep/multimodal/redis/langchain/requirements.txt b/comps/dataprep/multimodal/redis/langchain/requirements.txt
index b368bb2336..55c8ba3c87 100644
--- a/comps/dataprep/multimodal/redis/langchain/requirements.txt
+++ b/comps/dataprep/multimodal/redis/langchain/requirements.txt
@@ -11,6 +11,7 @@ opentelemetry-sdk
 Pillow
 prometheus-fastapi-instrumentator
 pydantic
+pymupdf
 python-multipart
 redis
 shortuuid

From 69da135f3fc9feb0f7714e3092d8f90a6a5146b5 Mon Sep 17 00:00:00 2001
From: Melanie Buehler <melanie.h.buehler@intel.com>
Date: Mon, 2 Dec 2024 14:56:28 -0800
Subject: [PATCH 2/5] PDF ingestion fixes

Signed-off-by: Melanie Buehler <melanie.h.buehler@intel.com>
---
 .../redis/langchain/prepare_videodoc_redis.py | 82 +++++++++++++++----
 1 file changed, 67 insertions(+), 15 deletions(-)

diff --git a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py
index 780fbd10b1..6a9fcb8f0d 100644
--- a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py
+++ b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py
@@ -1,6 +1,8 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+import base64
+import json
 import os
 import shutil
 import time
@@ -303,7 +305,46 @@ def prepare_data_and_metadata_from_annotation(
     return text_list, image_list, metadatas
 
 
-def ingest_multimodal(videoname, data_folder, embeddings):
+def prepare_pdf_data_from_annotation(
+    annotation, path_to_frames, title):
+    text_list = []
+    image_list = []
+    metadatas = []
+    for i, frame in enumerate(annotation):
+        page_index = frame["frame_no"]
+        image_index = frame["sub_video_id"]
+        path_to_frame = os.path.join(path_to_frames, f"page{page_index}_image{image_index}.png")
+        caption_for_ingesting = frame["caption"]
+        caption_for_inference = frame["caption"]
+
+        video_id = frame["video_id"]
+        b64_img_str = frame["b64_img_str"]
+        time_of_frame = frame["time"]
+        embedding_type = "pair" if b64_img_str else "text"
+        source_video = frame["video_name"]
+
+        text_list.append(caption_for_ingesting)
+
+        if b64_img_str:
+            image_list.append(path_to_frame)
+
+        metadatas.append(
+            {
+                "content": caption_for_ingesting,
+                "b64_img_str": b64_img_str,
+                "video_id": video_id,
+                "source_video": source_video,
+                "time_of_frame_ms": page_index,  # For PDFs save the page number
+                "embedding_type": embedding_type,
+                "title": title,
+                "transcript_for_inference": caption_for_inference,
+            }
+        )
+
+    return text_list, image_list, metadatas
+
+
+def ingest_multimodal(videoname, data_folder, embeddings, is_pdf=False):
     """Ingest text image pairs to Redis from the data/ directory that consists of frames and annotations."""
     data_folder = os.path.abspath(data_folder)
     annotation_file_path = os.path.join(data_folder, "annotations.json")
@@ -312,7 +353,10 @@ def ingest_multimodal(videoname, data_folder, embeddings):
     annotation = load_json_file(annotation_file_path)
 
     # prepare data to ingest
-    text_list, image_list, metadatas = prepare_data_and_metadata_from_annotation(annotation, path_to_frames, videoname)
+    if is_pdf:
+        text_list, image_list, metadatas = prepare_pdf_data_from_annotation(annotation, path_to_frames, videoname)
+    else:
+        text_list, image_list, metadatas = prepare_data_and_metadata_from_annotation(annotation, path_to_frames, videoname)
 
     MultimodalRedis.from_text_image_pairs_return_keys(
         texts=[f"From {videoname}. " + text for text in text_list],
@@ -567,41 +611,49 @@ async def ingest_with_text(files: List[UploadFile] = File(None)):
             uploaded_files_map[file_name] = media_file_name
 
             if file_extension == ".pdf":
-                import cv2
                 # Set up location to store frames and annotations
                 output_dir = os.path.join(upload_folder, media_dir_name)
                 os.makedirs(output_dir, exist_ok=True)
                 os.makedirs(os.path.join(output_dir, "frames"), exist_ok=True)
                 doc = pymupdf.open(os.path.join(upload_folder, media_file_name))
                 annotations = []
-                for page in doc:
+                for page_idx, page in enumerate(doc, start=1):
                     text = page.get_text()
-                    blocks = page.get_text("dict")["blocks"]
-                    imgblocks = [b for b in blocks if b["type"] == 1]
-                    for idx, image in enumerate(imgblocks):
+                    images = page.get_images()
+                    for image_idx, image in enumerate(images, start=1):
                         # Write image and caption file for each image found in pdf
-                        img_fname = f"frame_{idx}"
+                        img_fname = f"page{page_idx}_image{image_idx}"
                         img_fpath = os.path.join(output_dir, "frames", img_fname + ".png")
-                        cv2.imwrite(img_fpath, imgblocks[0]['image'])
+                        pix = pymupdf.Pixmap(doc, image[0])  # create pixmap
+
+                        if pix.n - pix.alpha > 3:  # if CMYK, convert to RGB first
+                            pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
+
+                        pix.save(img_fpath)
+                        pix = None
 
                         # Convert image to base64 encoded string
-                        b64_img_str = convert_img_to_base64(frame)
+                        with open(img_fpath, "rb") as image2str: 
+                            encoded_string = base64.b64encode(image2str.read())
 
                         # Create annotations for frame from transcripts
                         annotations.append(
                             {
                                 "video_id": file_id,
                                 "video_name": os.path.basename(os.path.join(upload_folder, media_file_name)),
-                                "b64_img_str": b64_img_str,
+                                "b64_img_str": encoded_string.decode(),
                                 "caption": text,
                                 "time": 0.0,
-                                "frame_no": 0,
-                                "sub_video_id": idx,
+                                "frame_no": page_idx,
+                                "sub_video_id": image_idx,
                             }
                         )
 
                 with open(os.path.join(output_dir, "annotations.json"), "w") as f:
                     json.dump(annotations, f)
+
+                # Ingest multimodal data into redis
+                ingest_multimodal(file_name, os.path.join(upload_folder, media_dir_name), embeddings, is_pdf=True)
             else:
                 # Save caption file in upload directory
                 caption_file_extension = os.path.splitext(matched_files[media_file][1].filename)[1]
@@ -623,8 +675,8 @@ async def ingest_with_text(files: List[UploadFile] = File(None)):
                 # Ingest multimodal data into redis
                 ingest_multimodal(file_name, os.path.join(upload_folder, media_dir_name), embeddings)
 
-                # Delete temporary media directory containing frames and annotations
-                shutil.rmtree(os.path.join(upload_folder, media_dir_name))
+            # Delete temporary media directory containing frames and annotations
+            shutil.rmtree(os.path.join(upload_folder, media_dir_name))
 
             print(f"Processed file {media_file}")
 

From 4fba10501c0380b7b627e675c144fe8131f0f3e9 Mon Sep 17 00:00:00 2001
From: Melanie Buehler <melanie.h.buehler@intel.com>
Date: Thu, 12 Dec 2024 16:01:11 -0800
Subject: [PATCH 3/5] Adds a test for dataprep microservice

Signed-off-by: Melanie Buehler <melanie.h.buehler@intel.com>
---
 ...est_dataprep_multimodal_redis_langchain.sh | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh b/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh
index 664f72c628..f35df04681 100644
--- a/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh
+++ b/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh
@@ -20,6 +20,8 @@ audio_fn="${tmp_dir}/${audio_name}.wav"
 image_name="apple"
 image_fn="${tmp_dir}/${image_name}.png"
 caption_fn="${tmp_dir}/${image_name}.txt"
+pdf_name="nke-10k-2023"
+pdf_fn="${tmp_dir}/${pdf_name}.pdf"
 
 function build_docker_images() {
     cd $WORKPATH
@@ -132,6 +134,9 @@ tire.""" > ${transcript_fn}
     echo "Downloading Audio"
     wget https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav -O ${audio_fn}
 
+    echo "Downloading PDF"
+    wget https://raw.githubusercontent.com/opea-project/GenAIComps/main/comps/retrievers/redis/data/nke-10k-2023.pdf -O ${pdf_fn}
+
 }
 
 function validate_microservice() {
@@ -256,6 +261,30 @@ function validate_microservice() {
         echo "[ $SERVICE_NAME ] Content is as expected."
     fi
 
+    # test v1/ingest_with_text with a PDF file
+    echo "Testing ingest_with_text API with a PDF file"
+    URL="http://${ip_address}:$dataprep_service_port/v1/ingest_with_text"
+
+    HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F "files=@$pdf_fn" -H 'Content-Type: multipart/form-data' "$URL")
+    HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://')
+    RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g')
+    SERVICE_NAME="dataprep - upload - file"
+
+    if [ "$HTTP_STATUS" -ne "200" ]; then
+        echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        docker logs test-comps-dataprep-multimodal-redis >> ${LOG_PATH}/dataprep_upload_file.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
+    fi
+    if [[ "$RESPONSE_BODY" != *"Data preparation succeeded"* ]]; then
+        echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
+        docker logs test-comps-dataprep-multimodal-redis >> ${LOG_PATH}/dataprep_upload_file.log
+        exit 1
+    else
+        echo "[ $SERVICE_NAME ] Content is as expected."
+    fi
+
     # test v1/generate_captions upload video file
     echo "Testing generate_captions API with video"
     URL="http://${ip_address}:$dataprep_service_port/v1/generate_captions"

From 71dcd6675ff348e3e607a868661ca30c3575e38c Mon Sep 17 00:00:00 2001
From: Melanie Buehler <melanie.h.buehler@intel.com>
Date: Fri, 13 Dec 2024 14:59:36 -0800
Subject: [PATCH 4/5] Improved comments, variable name, and a docstring

Signed-off-by: Melanie Buehler <melanie.h.buehler@intel.com>
---
 .../redis/langchain/prepare_videodoc_redis.py | 27 ++++++++++++-------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py
index 6a9fcb8f0d..ef2828ea7b 100644
--- a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py
+++ b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py
@@ -305,12 +305,20 @@ def prepare_data_and_metadata_from_annotation(
     return text_list, image_list, metadatas
 
 
-def prepare_pdf_data_from_annotation(
-    annotation, path_to_frames, title):
+def prepare_pdf_data_from_annotation(annotation, path_to_frames, title):
+    """PDF data processing has some key differences from videos and images.
+    
+    1. Neighboring frames' transcripts are not currently considered relevant.
+       We are only taking the text located on the same page as the image.
+    2. The images/frames are indexed differently, by page and image-within-page
+       indices, as opposed to a single frame index.
+    3. Instead of time of frame in ms, we return the PDF page index through
+       the pre-existing time_of_frame_ms metadata key to maintain compatibility.
+    """
     text_list = []
     image_list = []
     metadatas = []
-    for i, frame in enumerate(annotation):
+    for frame in annotation:
         page_index = frame["frame_no"]
         image_index = frame["sub_video_id"]
         path_to_frame = os.path.join(path_to_frames, f"page{page_index}_image{image_index}.png")
@@ -319,7 +327,6 @@ def prepare_pdf_data_from_annotation(
 
         video_id = frame["video_id"]
         b64_img_str = frame["b64_img_str"]
-        time_of_frame = frame["time"]
         embedding_type = "pair" if b64_img_str else "text"
         source_video = frame["video_name"]
 
@@ -344,7 +351,7 @@ def prepare_pdf_data_from_annotation(
     return text_list, image_list, metadatas
 
 
-def ingest_multimodal(videoname, data_folder, embeddings, is_pdf=False):
+def ingest_multimodal(filename, data_folder, embeddings, is_pdf=False):
     """Ingest text image pairs to Redis from the data/ directory that consists of frames and annotations."""
     data_folder = os.path.abspath(data_folder)
     annotation_file_path = os.path.join(data_folder, "annotations.json")
@@ -354,12 +361,12 @@ def ingest_multimodal(videoname, data_folder, embeddings, is_pdf=False):
 
     # prepare data to ingest
     if is_pdf:
-        text_list, image_list, metadatas = prepare_pdf_data_from_annotation(annotation, path_to_frames, videoname)
+        text_list, image_list, metadatas = prepare_pdf_data_from_annotation(annotation, path_to_frames, filename)
     else:
-        text_list, image_list, metadatas = prepare_data_and_metadata_from_annotation(annotation, path_to_frames, videoname)
+        text_list, image_list, metadatas = prepare_data_and_metadata_from_annotation(annotation, path_to_frames, filename)
 
     MultimodalRedis.from_text_image_pairs_return_keys(
-        texts=[f"From {videoname}. " + text for text in text_list],
+        texts=[f"From {filename}. " + text for text in text_list],
         images=image_list,
         embedding=embeddings,
         metadatas=metadatas,
@@ -611,7 +618,7 @@ async def ingest_with_text(files: List[UploadFile] = File(None)):
             uploaded_files_map[file_name] = media_file_name
 
             if file_extension == ".pdf":
-                # Set up location to store frames and annotations
+                # Set up location to store pdf images and text, reusing "frames" and "annotations" from video
                 output_dir = os.path.join(upload_folder, media_dir_name)
                 os.makedirs(output_dir, exist_ok=True)
                 os.makedirs(os.path.join(output_dir, "frames"), exist_ok=True)
@@ -636,7 +643,7 @@ async def ingest_with_text(files: List[UploadFile] = File(None)):
                         with open(img_fpath, "rb") as image2str: 
                             encoded_string = base64.b64encode(image2str.read())
 
-                        # Create annotations for frame from transcripts
+                        # Create annotations file, reusing metadata keys from video
                         annotations.append(
                             {
                                 "video_id": file_id,

From 1d38290c314fd308f199c54dbcaa2504f2659417 Mon Sep 17 00:00:00 2001
From: Melanie Buehler <melanie.h.buehler@intel.com>
Date: Wed, 18 Dec 2024 11:58:47 -0800
Subject: [PATCH 5/5] Updated for review feedback

Signed-off-by: Melanie Buehler <melanie.h.buehler@intel.com>
---
 comps/dataprep/multimodal/redis/langchain/README.md      | 7 +++++--
 .../multimodal/redis/langchain/prepare_videodoc_redis.py | 9 +++++----
 .../dataprep/test_dataprep_multimodal_redis_langchain.sh | 2 +-
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/comps/dataprep/multimodal/redis/langchain/README.md b/comps/dataprep/multimodal/redis/langchain/README.md
index db24b431fd..5c21e16c25 100644
--- a/comps/dataprep/multimodal/redis/langchain/README.md
+++ b/comps/dataprep/multimodal/redis/langchain/README.md
@@ -5,6 +5,7 @@ This `dataprep` microservice accepts the following from the user and ingests the
 - Videos (mp4 files) and their transcripts (optional)
 - Images (gif, jpg, jpeg, and png files) and their captions (optional)
 - Audio (wav files)
+- PDFs (with text and images)
 
 ## 🚀1. Start Microservice with Python（Option 1）
 
@@ -111,18 +112,19 @@ docker container logs -f dataprep-multimodal-redis
 
 ## 🚀4. Consume Microservice
 
-Once this dataprep microservice is started, user can use the below commands to invoke the microservice to convert images and videos and their transcripts (optional) to embeddings and save to the Redis vector store.
+Once this dataprep microservice is started, users can use the commands below to invoke it to convert images, videos, text, and PDF files to embeddings and save them to the Redis vector store.
 
 This microservice provides 3 different ways for users to ingest files into Redis vector store corresponding to the 3 use cases.
 
 ### 4.1 Consume _ingest_with_text_ API
 
-**Use case:** This API is used when videos are accompanied by transcript files (`.vtt` format) or images are accompanied by text caption files (`.txt` format).
+**Use case:** This API is used for videos accompanied by transcript files (`.vtt` format), images accompanied by text caption files (`.txt` format), and PDF files containing a mix of text and images.
 
 **Important notes:**
 
 - Make sure the file paths after `files=@` are correct.
 - Every transcript or caption file's name must be identical to its corresponding video or image file's name (except their extension - .vtt goes with .mp4 and .txt goes with .jpg, .jpeg, .png, or .gif). For example, `video1.mp4` and `video1.vtt`. Otherwise, if `video1.vtt` is not included correctly in the API call, the microservice will return an error `No captions file video1.vtt found for video1.mp4`.
+- It is assumed that PDFs will contain at least one image. Each image in the file will be embedded along with the text that appears on the same page as the image; text on pages that contain no images is not ingested.
 
 #### Single video-transcript pair upload
 
@@ -157,6 +159,7 @@ curl -X POST \
     -F "files=@./image1.txt" \
     -F "files=@./image2.jpg" \
     -F "files=@./image2.txt" \
+    -F "files=@./example.pdf" \
     http://localhost:6007/v1/ingest_with_text
 ```
 
diff --git a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py
index ef2828ea7b..0c274b82f0 100644
--- a/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py
+++ b/comps/dataprep/multimodal/redis/langchain/prepare_videodoc_redis.py
@@ -19,7 +19,6 @@
 from langchain_core.utils import get_from_dict_or_env
 from multimodal_utils import (
     clear_upload_folder,
-    convert_img_to_base64,
     convert_video_to_audio,
     create_upload_folder,
     delete_audio_file,
@@ -636,19 +635,21 @@ async def ingest_with_text(files: List[UploadFile] = File(None)):
                         if pix.n - pix.alpha > 3:  # if CMYK, convert to RGB first
                             pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
 
-                        pix.save(img_fpath)
+                        pix.save(img_fpath)  # pixmap to png
                         pix = None
 
                         # Convert image to base64 encoded string
                         with open(img_fpath, "rb") as image2str: 
-                            encoded_string = base64.b64encode(image2str.read())
+                            encoded_string = base64.b64encode(image2str.read())  # png to bytes
+
+                        decoded_string = encoded_string.decode()  # bytes to string
 
                         # Create annotations file, reusing metadata keys from video
                         annotations.append(
                             {
                                 "video_id": file_id,
                                 "video_name": os.path.basename(os.path.join(upload_folder, media_file_name)),
-                                "b64_img_str": encoded_string.decode(),
+                                "b64_img_str": decoded_string,
                                 "caption": text,
                                 "time": 0.0,
                                 "frame_no": page_idx,
diff --git a/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh b/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh
index f35df04681..b19cee10a7 100644
--- a/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh
+++ b/tests/dataprep/test_dataprep_multimodal_redis_langchain.sh
@@ -348,7 +348,7 @@ function validate_microservice() {
     else
         echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
     fi
-    if [[ "$RESPONSE_BODY" != *${image_name}* || "$RESPONSE_BODY" != *${video_name}* || "$RESPONSE_BODY" != *${audio_name}* ]]; then
+    if [[ "$RESPONSE_BODY" != *${image_name}* || "$RESPONSE_BODY" != *${video_name}* || "$RESPONSE_BODY" != *${audio_name}* || "$RESPONSE_BODY" != *${pdf_name}* ]]; then
         echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY"
         docker logs test-comps-dataprep-multimodal-redis >> ${LOG_PATH}/dataprep_file.log
         exit 1