From 7aa97a1422ba34e3378ed24886144039d349f3ac Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 4 Apr 2025 18:53:51 -0400 Subject: [PATCH 01/36] new: `DataprepRequest` --- comps/cores/proto/api_protocol.py | 14 +++++++ .../src/integrations/elasticsearch.py | 19 +++++----- comps/dataprep/src/integrations/milvus.py | 17 +++++---- .../src/integrations/milvus_multimodal.py | 4 +- .../src/integrations/neo4j_langchain.py | 17 +++++---- .../src/integrations/neo4j_llamaindex.py | 18 +++++---- comps/dataprep/src/integrations/opensearch.py | 17 +++++---- comps/dataprep/src/integrations/pgvect.py | 13 +++---- comps/dataprep/src/integrations/pipecone.py | 17 +++++---- comps/dataprep/src/integrations/qdrant.py | 17 +++++---- comps/dataprep/src/integrations/redis.py | 19 +++++----- .../src/integrations/redis_finance.py | 13 +++---- .../src/integrations/redis_multimodal.py | 6 ++- comps/dataprep/src/integrations/vdms.py | 17 +++++---- .../src/integrations/vdms_multimodal.py | 3 +- .../src/opea_dataprep_microservice.py | 38 +++---------------- 16 files changed, 123 insertions(+), 126 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index dc86746c19..7b16a9c8d8 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -79,6 +79,20 @@ class TokenCheckResponseItem(BaseModel): class TokenCheckResponse(BaseModel): prompts: List[TokenCheckResponseItem] +class DataprepRequest(BaseModel): + files: Optional[Union[UploadFile, List[UploadFile]]] = None + link_list: Optional[str] = None + chunk_size: int = 1500 + chunk_overlap: int = 100 + process_table: bool = False + table_strategy: str = "fast" + ingest_from_graphDB: bool = False + +class Neo4jDataprepRequest(DataprepRequest): + ingest_from_graphDB: bool = False + +class RedisDataprepRequest(DataprepRequest): + index_name: Optional[str] = None class EmbeddingRequest(BaseModel): # Ordered by official OpenAI API documentation diff --git 
a/comps/dataprep/src/integrations/elasticsearch.py b/comps/dataprep/src/integrations/elasticsearch.py index aaa1c9f1e2..e922413268 100644 --- a/comps/dataprep/src/integrations/elasticsearch.py +++ b/comps/dataprep/src/integrations/elasticsearch.py @@ -15,6 +15,7 @@ from langchain_huggingface import HuggingFaceEmbeddings from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -237,16 +238,7 @@ async def ingest_link_to_elastic(self, link_list: List[str]) -> None: if logflag: logger.info(f"Processed batch {i // batch_size + 1}/{(num_chunks - 1) // batch_size + 1}") - async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), - ): + async def ingest_files(self, input: DataprepRequest): """Ingest files/links content into ElasticSearch database. Save in the format of vector[768]. @@ -259,6 +251,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/milvus.py b/comps/dataprep/src/integrations/milvus.py index f5957d320b..c8ff0cec8a 100644 --- a/comps/dataprep/src/integrations/milvus.py +++ b/comps/dataprep/src/integrations/milvus.py @@ -17,6 +17,7 @@ from langchain_text_splitters import HTMLHeaderTextSplitter from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -245,14 +246,7 @@ def invoke(self, *args, **kwargs): pass async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest ): """Ingest files/links content into milvus database. @@ -266,6 +260,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"[ milvus ingest ] files:{files}") logger.info(f"[ milvus ingest ] link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/milvus_multimodal.py b/comps/dataprep/src/integrations/milvus_multimodal.py index cd8f40e93a..336c0cebc5 100644 --- a/comps/dataprep/src/integrations/milvus_multimodal.py +++ b/comps/dataprep/src/integrations/milvus_multimodal.py @@ -21,6 +21,7 @@ from PIL import Image from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.third_parties.bridgetower.src.bridgetower_embedding import BridgeTowerEmbedding from .utils.multimodal import ( @@ -591,7 +592,8 @@ async def ingest_generate_captions(self, files: List[UploadFile] = File(None)): raise HTTPException(status_code=400, detail="Must provide at least one file.") - async def ingest_files(self, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None)): + async def ingest_files(self, input: DataprepRequest): + files = input.files if logflag: logger.info(f"[ milvus ingest ] files:{files}") diff --git a/comps/dataprep/src/integrations/neo4j_langchain.py b/comps/dataprep/src/integrations/neo4j_langchain.py index c51dac7996..8b6d411162 100644 --- a/comps/dataprep/src/integrations/neo4j_langchain.py +++ b/comps/dataprep/src/integrations/neo4j_langchain.py @@ -16,6 +16,7 @@ from langchain_text_splitters import HTMLHeaderTextSplitter from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( document_loader, encode_filename, @@ -138,14 +139,7 @@ async def ingest_data_to_neo4j(self, doc_path: DocPath): return True async def 
ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest ): """Ingest files/links content into Neo4j database. @@ -159,6 +153,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/neo4j_llamaindex.py b/comps/dataprep/src/integrations/neo4j_llamaindex.py index 85e3720598..53696e5680 100644 --- a/comps/dataprep/src/integrations/neo4j_llamaindex.py +++ b/comps/dataprep/src/integrations/neo4j_llamaindex.py @@ -36,6 +36,7 @@ from transformers import AutoTokenizer from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import Neo4jDataprepRequest from comps.dataprep.src.utils import ( document_loader, encode_filename, @@ -665,14 +666,7 @@ async def build_communities(self, index: PropertyGraphIndex): return False async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: Neo4jDataprepRequest, ): """Ingest files/links content into Neo4j database. 
@@ -687,6 +681,14 @@ async def ingest_files( table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). ingest_from_graphDB (bool, optional): Whether to skip generating graph from files and instead loading index from existing graph store. """ + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + ingest_from_graphDB = input.ingest_from_graphDB + if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/opensearch.py b/comps/dataprep/src/integrations/opensearch.py index b06ced2877..44e3212049 100644 --- a/comps/dataprep/src/integrations/opensearch.py +++ b/comps/dataprep/src/integrations/opensearch.py @@ -14,6 +14,7 @@ from opensearchpy import OpenSearch from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -321,14 +322,7 @@ def search_all_documents(self, index_name, offset, search_batch_size): return None async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest, ): """Ingest files/links content into opensearch database. @@ -342,6 +336,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"[ upload ] files:{files}") logger.info(f"[ upload ] link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/pgvect.py b/comps/dataprep/src/integrations/pgvect.py index 61965427a8..f2484e3ca5 100644 --- a/comps/dataprep/src/integrations/pgvect.py +++ b/comps/dataprep/src/integrations/pgvect.py @@ -15,6 +15,7 @@ from langchain_huggingface import HuggingFaceEmbeddings from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -244,14 +245,7 @@ async def ingest_link_to_pgvector(self, link_list: List[str]): return True async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest, ): """Ingest files/links content into pgvector database. @@ -265,6 +259,9 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" + files = input.files + link_list = input.link_list + if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/pipecone.py b/comps/dataprep/src/integrations/pipecone.py index dcdd16e95e..af843f2fa3 100644 --- a/comps/dataprep/src/integrations/pipecone.py +++ b/comps/dataprep/src/integrations/pipecone.py @@ -15,6 +15,7 @@ from pinecone import Pinecone, ServerlessSpec from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -230,14 +231,7 @@ async def ingest_link_to_pinecone(self, link_list: List[str], chunk_size, chunk_ return True async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest ): """Ingest files/links content into pipecone database. @@ -251,6 +245,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/qdrant.py b/comps/dataprep/src/integrations/qdrant.py index af7c572649..840b8c461e 100644 --- a/comps/dataprep/src/integrations/qdrant.py +++ b/comps/dataprep/src/integrations/qdrant.py @@ -14,6 +14,7 @@ from qdrant_client import QdrantClient from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( document_loader, encode_filename, @@ -152,14 +153,7 @@ async def ingest_data_to_qdrant(self, doc_path: DocPath): return True async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest, ): """Ingest files/links content into qdrant database. @@ -173,6 +167,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/redis.py b/comps/dataprep/src/integrations/redis.py index 07485ebcef..52d568a2d1 100644 --- a/comps/dataprep/src/integrations/redis.py +++ b/comps/dataprep/src/integrations/redis.py @@ -21,6 +21,7 @@ from redis.commands.search.indexDefinition import IndexDefinition, IndexType from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import RedisDataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -355,15 +356,7 @@ def invoke(self, *args, **kwargs): pass async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), - index_name: str = Form(None), + self, input: RedisDataprepRequest ): """Ingest files/links content into redis database. @@ -378,6 +371,14 @@ async def ingest_files( table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). index_name (str, optional): The name of the index where data will be ingested. 
""" + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + index_name = input.index_name + if logflag: logger.info(f"[ redis ingest ] files:{files}") logger.info(f"[ redis ingest ] link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/redis_finance.py b/comps/dataprep/src/integrations/redis_finance.py index d3fc50c9b1..d3eaa556c7 100644 --- a/comps/dataprep/src/integrations/redis_finance.py +++ b/comps/dataprep/src/integrations/redis_finance.py @@ -15,6 +15,7 @@ from comps import OpeaComponent, OpeaComponentRegistry, ServiceType from comps.dataprep.src.integrations.utils.redis_finance_utils import * from comps.dataprep.src.integrations.utils.redis_kv import RedisKVStore +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import encode_filename, save_content_to_local_disk logflag = os.getenv("LOGFLAG", False) @@ -221,14 +222,7 @@ def invoke(self, *args, **kwargs): pass async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest, ): """Ingest files/links content into redis database. @@ -242,6 +236,9 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" + files = input.files + link_list = input.link_list + if logflag: logger.info(f"[ redis ingest ] files:{files}") logger.info(f"[ redis ingest ] link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/redis_multimodal.py b/comps/dataprep/src/integrations/redis_multimodal.py index 713db0bac5..ee8547564f 100644 --- a/comps/dataprep/src/integrations/redis_multimodal.py +++ b/comps/dataprep/src/integrations/redis_multimodal.py @@ -21,6 +21,8 @@ from comps import CustomLogger, OpeaComponent, OpeaComponentRegistry, ServiceType from comps.third_parties.bridgetower.src.bridgetower_embedding import BridgeTowerEmbedding +from comps.cores.proto.api_protocol import DataprepRequest + from .utils.multimodal import ( clear_upload_folder, @@ -651,7 +653,9 @@ async def ingest_generate_captions(self, files: List[UploadFile] = File(None)): raise HTTPException(status_code=400, detail="Must provide at least one file.") - async def ingest_files(self, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None)): + async def ingest_files(self, input: DataprepRequest): + files = input.files + if files: accepted_media_formats = [".mp4", ".png", ".jpg", ".jpeg", ".gif", ".pdf"] # Create a lookup dictionary containing all media files diff --git a/comps/dataprep/src/integrations/vdms.py b/comps/dataprep/src/integrations/vdms.py index 5f389438af..9fb5f67db2 100644 --- a/comps/dataprep/src/integrations/vdms.py +++ b/comps/dataprep/src/integrations/vdms.py @@ -13,6 +13,7 @@ from langchain_vdms.vectorstores import VDMS, VDMS_Client from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -145,14 +146,7 @@ async def ingest_data_to_vdms(self, doc_path: DocPath): logger.info(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") async def ingest_files( - self, - files: 
Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest, ): """Ingest files/links content into VDMS database. @@ -166,6 +160,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"[ upload ] files:{files}") logger.info(f"[ upload ] link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/vdms_multimodal.py b/comps/dataprep/src/integrations/vdms_multimodal.py index a17030dfec..00b72e0eeb 100644 --- a/comps/dataprep/src/integrations/vdms_multimodal.py +++ b/comps/dataprep/src/integrations/vdms_multimodal.py @@ -15,6 +15,7 @@ from tqdm import tqdm from comps import CustomLogger, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from .utils import store_embeddings from .utils.utils import process_all_videos, read_config @@ -194,7 +195,7 @@ async def ingest_generate_transcripts(self, files: List[UploadFile] = File(None) async def ingest_generate_caption(self, files: List[UploadFile] = File(None)): pass - async def ingest_files(self, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None)): + async def ingest_files(self, input: DataprepRequest): pass async def get_files(self): diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index ac5c5443ad..5d1b149432 100644 --- 
a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -28,6 +28,7 @@ statistics_dict, ) from comps.dataprep.src.utils import create_upload_folder +from comps.cores.proto.api_protocol import DataprepRequest, RedisDataprepRequest, Neo4jDataprepRequest logger = CustomLogger("opea_dataprep_microservice") logflag = os.getenv("LOGFLAG", False) @@ -49,45 +50,18 @@ port=5000, ) @register_statistics(names=["opea_service@dataprep"]) -async def ingest_files( - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), - index_name: Optional[str] = Form(None), -): +async def ingest_files(input: Union[DataprepRequest, RedisDataprepRequest, Neo4jDataprepRequest]): start = time.time() + files = input.files + link_list = input.link_list + if logflag: logger.info(f"[ ingest ] files:{files}") logger.info(f"[ ingest ] link_list:{link_list}") try: - # Use the loader to invoke the component - if dataprep_component_name == "OPEA_DATAPREP_REDIS": - response = await loader.ingest_files( - files, - link_list, - chunk_size, - chunk_overlap, - process_table, - table_strategy, - ingest_from_graphDB, - index_name, - ) - else: - if index_name: - logger.error( - 'Error during dataprep ingest invocation: "index_name" option is supported if "DATAPREP_COMPONENT_NAME" environment variable is set to "OPEA_DATAPREP_REDIS". 
i.e: export DATAPREP_COMPONENT_NAME="OPEA_DATAPREP_REDIS"' - ) - raise - - response = await loader.ingest_files( - files, link_list, chunk_size, chunk_overlap, process_table, table_strategy, ingest_from_graphDB - ) + response = await loader.ingest_files(input) # Log the result if logging is enabled if logflag: From 75f4ecd7bf28b118bac358256b31bef31602c517 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 4 Apr 2025 22:56:10 +0000 Subject: [PATCH 02/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/cores/proto/api_protocol.py | 4 ++++ comps/dataprep/src/integrations/milvus.py | 4 +--- comps/dataprep/src/integrations/neo4j_langchain.py | 4 +--- comps/dataprep/src/integrations/neo4j_llamaindex.py | 3 ++- comps/dataprep/src/integrations/opensearch.py | 3 ++- comps/dataprep/src/integrations/pgvect.py | 3 ++- comps/dataprep/src/integrations/pipecone.py | 4 +--- comps/dataprep/src/integrations/qdrant.py | 3 ++- comps/dataprep/src/integrations/redis.py | 4 +--- comps/dataprep/src/integrations/redis_finance.py | 5 +++-- comps/dataprep/src/integrations/redis_multimodal.py | 3 +-- comps/dataprep/src/integrations/vdms.py | 3 ++- comps/dataprep/src/opea_dataprep_microservice.py | 2 +- 13 files changed, 23 insertions(+), 22 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index 7b16a9c8d8..1b84dd55d6 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -79,6 +79,7 @@ class TokenCheckResponseItem(BaseModel): class TokenCheckResponse(BaseModel): prompts: List[TokenCheckResponseItem] + class DataprepRequest(BaseModel): files: Optional[Union[UploadFile, List[UploadFile]]] = None link_list: Optional[str] = None @@ -88,12 +89,15 @@ class DataprepRequest(BaseModel): table_strategy: str = "fast" ingest_from_graphDB: bool = False + class 
Neo4jDataprepRequest(DataprepRequest): ingest_from_graphDB: bool = False + class RedisDataprepRequest(DataprepRequest): index_name: Optional[str] = None + class EmbeddingRequest(BaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/embeddings diff --git a/comps/dataprep/src/integrations/milvus.py b/comps/dataprep/src/integrations/milvus.py index c8ff0cec8a..68080736aa 100644 --- a/comps/dataprep/src/integrations/milvus.py +++ b/comps/dataprep/src/integrations/milvus.py @@ -245,9 +245,7 @@ def check_health(self) -> bool: def invoke(self, *args, **kwargs): pass - async def ingest_files( - self, input: DataprepRequest - ): + async def ingest_files(self, input: DataprepRequest): """Ingest files/links content into milvus database. Save in the format of vector[], the vector length depends on the emedding model type. diff --git a/comps/dataprep/src/integrations/neo4j_langchain.py b/comps/dataprep/src/integrations/neo4j_langchain.py index 8b6d411162..033b4a472e 100644 --- a/comps/dataprep/src/integrations/neo4j_langchain.py +++ b/comps/dataprep/src/integrations/neo4j_langchain.py @@ -138,9 +138,7 @@ async def ingest_data_to_neo4j(self, doc_path: DocPath): return True - async def ingest_files( - self, input: DataprepRequest - ): + async def ingest_files(self, input: DataprepRequest): """Ingest files/links content into Neo4j database. Save in the format of vector[768]. diff --git a/comps/dataprep/src/integrations/neo4j_llamaindex.py b/comps/dataprep/src/integrations/neo4j_llamaindex.py index 53696e5680..adcbb48744 100644 --- a/comps/dataprep/src/integrations/neo4j_llamaindex.py +++ b/comps/dataprep/src/integrations/neo4j_llamaindex.py @@ -666,7 +666,8 @@ async def build_communities(self, index: PropertyGraphIndex): return False async def ingest_files( - self, input: Neo4jDataprepRequest, + self, + input: Neo4jDataprepRequest, ): """Ingest files/links content into Neo4j database. 
diff --git a/comps/dataprep/src/integrations/opensearch.py b/comps/dataprep/src/integrations/opensearch.py index 44e3212049..8aeb66ef44 100644 --- a/comps/dataprep/src/integrations/opensearch.py +++ b/comps/dataprep/src/integrations/opensearch.py @@ -322,7 +322,8 @@ def search_all_documents(self, index_name, offset, search_batch_size): return None async def ingest_files( - self, input: DataprepRequest, + self, + input: DataprepRequest, ): """Ingest files/links content into opensearch database. diff --git a/comps/dataprep/src/integrations/pgvect.py b/comps/dataprep/src/integrations/pgvect.py index f2484e3ca5..276496eeb0 100644 --- a/comps/dataprep/src/integrations/pgvect.py +++ b/comps/dataprep/src/integrations/pgvect.py @@ -245,7 +245,8 @@ async def ingest_link_to_pgvector(self, link_list: List[str]): return True async def ingest_files( - self, input: DataprepRequest, + self, + input: DataprepRequest, ): """Ingest files/links content into pgvector database. diff --git a/comps/dataprep/src/integrations/pipecone.py b/comps/dataprep/src/integrations/pipecone.py index af843f2fa3..3a06845211 100644 --- a/comps/dataprep/src/integrations/pipecone.py +++ b/comps/dataprep/src/integrations/pipecone.py @@ -230,9 +230,7 @@ async def ingest_link_to_pinecone(self, link_list: List[str], chunk_size, chunk_ return True - async def ingest_files( - self, input: DataprepRequest - ): + async def ingest_files(self, input: DataprepRequest): """Ingest files/links content into pipecone database. Save in the format of vector[768]. diff --git a/comps/dataprep/src/integrations/qdrant.py b/comps/dataprep/src/integrations/qdrant.py index 840b8c461e..4b633d86e0 100644 --- a/comps/dataprep/src/integrations/qdrant.py +++ b/comps/dataprep/src/integrations/qdrant.py @@ -153,7 +153,8 @@ async def ingest_data_to_qdrant(self, doc_path: DocPath): return True async def ingest_files( - self, input: DataprepRequest, + self, + input: DataprepRequest, ): """Ingest files/links content into qdrant database. 
diff --git a/comps/dataprep/src/integrations/redis.py b/comps/dataprep/src/integrations/redis.py index 52d568a2d1..ffd233ea95 100644 --- a/comps/dataprep/src/integrations/redis.py +++ b/comps/dataprep/src/integrations/redis.py @@ -355,9 +355,7 @@ async def check_health(self) -> bool: def invoke(self, *args, **kwargs): pass - async def ingest_files( - self, input: RedisDataprepRequest - ): + async def ingest_files(self, input: RedisDataprepRequest): """Ingest files/links content into redis database. Save in the format of vector[768]. diff --git a/comps/dataprep/src/integrations/redis_finance.py b/comps/dataprep/src/integrations/redis_finance.py index d3eaa556c7..04b8286f67 100644 --- a/comps/dataprep/src/integrations/redis_finance.py +++ b/comps/dataprep/src/integrations/redis_finance.py @@ -13,9 +13,9 @@ from langchain_community.vectorstores import Redis from comps import OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.integrations.utils.redis_finance_utils import * from comps.dataprep.src.integrations.utils.redis_kv import RedisKVStore -from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import encode_filename, save_content_to_local_disk logflag = os.getenv("LOGFLAG", False) @@ -222,7 +222,8 @@ def invoke(self, *args, **kwargs): pass async def ingest_files( - self, input: DataprepRequest, + self, + input: DataprepRequest, ): """Ingest files/links content into redis database. 
diff --git a/comps/dataprep/src/integrations/redis_multimodal.py b/comps/dataprep/src/integrations/redis_multimodal.py index ee8547564f..efa40e2b5e 100644 --- a/comps/dataprep/src/integrations/redis_multimodal.py +++ b/comps/dataprep/src/integrations/redis_multimodal.py @@ -20,9 +20,8 @@ from PIL import Image from comps import CustomLogger, OpeaComponent, OpeaComponentRegistry, ServiceType -from comps.third_parties.bridgetower.src.bridgetower_embedding import BridgeTowerEmbedding from comps.cores.proto.api_protocol import DataprepRequest - +from comps.third_parties.bridgetower.src.bridgetower_embedding import BridgeTowerEmbedding from .utils.multimodal import ( clear_upload_folder, diff --git a/comps/dataprep/src/integrations/vdms.py b/comps/dataprep/src/integrations/vdms.py index 9fb5f67db2..349734605f 100644 --- a/comps/dataprep/src/integrations/vdms.py +++ b/comps/dataprep/src/integrations/vdms.py @@ -146,7 +146,8 @@ async def ingest_data_to_vdms(self, doc_path: DocPath): logger.info(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") async def ingest_files( - self, input: DataprepRequest, + self, + input: DataprepRequest, ): """Ingest files/links content into VDMS database. 
diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index 5d1b149432..34ceae74df 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -27,8 +27,8 @@ register_statistics, statistics_dict, ) +from comps.cores.proto.api_protocol import DataprepRequest, Neo4jDataprepRequest, RedisDataprepRequest from comps.dataprep.src.utils import create_upload_folder -from comps.cores.proto.api_protocol import DataprepRequest, RedisDataprepRequest, Neo4jDataprepRequest logger = CustomLogger("opea_dataprep_microservice") logflag = os.getenv("LOGFLAG", False) From c10a0135aa3b45c08cffe919b4b01ba05556a5d4 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 4 Apr 2025 19:04:20 -0400 Subject: [PATCH 03/36] fix: docstrings --- comps/dataprep/src/integrations/elasticsearch.py | 13 +++++++------ comps/dataprep/src/integrations/milvus.py | 13 +++++++------ .../dataprep/src/integrations/neo4j_langchain.py | 13 +++++++------ .../dataprep/src/integrations/neo4j_llamaindex.py | 15 ++++++++------- comps/dataprep/src/integrations/opensearch.py | 13 +++++++------ comps/dataprep/src/integrations/pgvect.py | 13 +++++++------ comps/dataprep/src/integrations/pipecone.py | 13 +++++++------ comps/dataprep/src/integrations/qdrant.py | 13 +++++++------ comps/dataprep/src/integrations/redis.py | 15 ++++++++------- comps/dataprep/src/integrations/redis_finance.py | 13 +++++++------ comps/dataprep/src/integrations/vdms.py | 13 +++++++------ 11 files changed, 79 insertions(+), 68 deletions(-) diff --git a/comps/dataprep/src/integrations/elasticsearch.py b/comps/dataprep/src/integrations/elasticsearch.py index e922413268..fb9d07e8de 100644 --- a/comps/dataprep/src/integrations/elasticsearch.py +++ b/comps/dataprep/src/integrations/elasticsearch.py @@ -244,12 +244,13 @@ async def ingest_files(self, input: DataprepRequest): Save in the format of vector[768]. 
Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/milvus.py b/comps/dataprep/src/integrations/milvus.py index c8ff0cec8a..bebf4dcfeb 100644 --- a/comps/dataprep/src/integrations/milvus.py +++ b/comps/dataprep/src/integrations/milvus.py @@ -253,12 +253,13 @@ async def ingest_files( Save in the format of vector[], the vector length depends on the emedding model type. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. 
Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/neo4j_langchain.py b/comps/dataprep/src/integrations/neo4j_langchain.py index 8b6d411162..b5e0c58097 100644 --- a/comps/dataprep/src/integrations/neo4j_langchain.py +++ b/comps/dataprep/src/integrations/neo4j_langchain.py @@ -146,12 +146,13 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. 
Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/neo4j_llamaindex.py b/comps/dataprep/src/integrations/neo4j_llamaindex.py index 53696e5680..cdfbe1347f 100644 --- a/comps/dataprep/src/integrations/neo4j_llamaindex.py +++ b/comps/dataprep/src/integrations/neo4j_llamaindex.py @@ -673,13 +673,14 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). - ingest_from_graphDB (bool, optional): Whether to skip generating graph from files and instead loading index from existing graph store. 
+ input (Neo4jDataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + ingest_from_graphDB (bool, optional): Whether to skip generating graph from files and instead loading index from existing graph store. """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/opensearch.py b/comps/dataprep/src/integrations/opensearch.py index 44e3212049..f34b4d7695 100644 --- a/comps/dataprep/src/integrations/opensearch.py +++ b/comps/dataprep/src/integrations/opensearch.py @@ -329,12 +329,13 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
+ input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/pgvect.py b/comps/dataprep/src/integrations/pgvect.py index f2484e3ca5..5e8e91037d 100644 --- a/comps/dataprep/src/integrations/pgvect.py +++ b/comps/dataprep/src/integrations/pgvect.py @@ -252,12 +252,13 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). 
+ chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/pipecone.py b/comps/dataprep/src/integrations/pipecone.py index af843f2fa3..ecc1753a76 100644 --- a/comps/dataprep/src/integrations/pipecone.py +++ b/comps/dataprep/src/integrations/pipecone.py @@ -238,12 +238,13 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). 
+ table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/qdrant.py b/comps/dataprep/src/integrations/qdrant.py index 840b8c461e..fa2acfe0f6 100644 --- a/comps/dataprep/src/integrations/qdrant.py +++ b/comps/dataprep/src/integrations/qdrant.py @@ -160,12 +160,13 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/redis.py b/comps/dataprep/src/integrations/redis.py index 52d568a2d1..3b054b1b69 100644 --- a/comps/dataprep/src/integrations/redis.py +++ b/comps/dataprep/src/integrations/redis.py @@ -363,13 +363,14 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). - index_name (str, optional): The name of the index where data will be ingested. + input (RedisDataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + index_name (str, optional): The name of the index where data will be ingested. 
""" files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/redis_finance.py b/comps/dataprep/src/integrations/redis_finance.py index d3eaa556c7..5b1eb79f87 100644 --- a/comps/dataprep/src/integrations/redis_finance.py +++ b/comps/dataprep/src/integrations/redis_finance.py @@ -229,12 +229,13 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/vdms.py b/comps/dataprep/src/integrations/vdms.py index 9fb5f67db2..0f69c4aa34 100644 --- a/comps/dataprep/src/integrations/vdms.py +++ b/comps/dataprep/src/integrations/vdms.py @@ -153,12 +153,13 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" files = input.files link_list = input.link_list From f08b6c048c7d293a0de4e3a7bdb34da044f00bea Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 4 Apr 2025 19:24:40 -0400 Subject: [PATCH 04/36] rem: `ingest_from_graphDB` --- comps/cores/proto/api_protocol.py | 1 - 1 file changed, 1 deletion(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index 1b84dd55d6..fb2e63b32d 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -87,7 +87,6 @@ class DataprepRequest(BaseModel): chunk_overlap: int = 100 process_table: bool = False table_strategy: str = "fast" - ingest_from_graphDB: bool = False class Neo4jDataprepRequest(DataprepRequest): From 7eabcfaa8cb5acbf84dc8b8c042325212283b478 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 7 Apr 2025 19:27:17 -0400 Subject: [PATCH 05/36] new: dep injection --- comps/cores/proto/api_protocol.py | 46 ++++++++++++++----- .../src/opea_dataprep_microservice.py | 15 ++++-- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index fb2e63b32d..2a61331612 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -10,6 +10,7 @@ from fastapi.responses import JSONResponse from pydantic import BaseModel, Field +from fastapi import File, Form, UploadFile class ServiceCard(BaseModel): object: str = "service" @@ -80,21 +81,44 @@ class TokenCheckResponse(BaseModel): prompts: List[TokenCheckResponseItem] -class DataprepRequest(BaseModel): - files: Optional[Union[UploadFile, List[UploadFile]]] = None - link_list: Optional[str] = None - chunk_size: int = 1500 - chunk_overlap: int = 100 - process_table: bool = False - table_strategy: str = "fast" - +class DataprepRequest: + def __init__( + self, + db_type: str = Form(None), + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = 
Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), + ): + self.db_type = db_type + self.files = files + self.link_list = link_list + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + self.process_table = process_table + self.table_strategy = table_strategy class Neo4jDataprepRequest(DataprepRequest): - ingest_from_graphDB: bool = False - + def __init__( + self, + ingest_from_graphDB: bool = Form(False), + **kwargs + ): + kwargs["db_type"] = "neo4j" + super().__init__(**kwargs) + self.ingest_from_graphDB = ingest_from_graphDB class RedisDataprepRequest(DataprepRequest): - index_name: Optional[str] = None + def __init__( + self, + index_name: Optional[str] = Form(None), + **kwargs + ): + kwargs["db_type"] = "redis" + super().__init__(**kwargs) + self.index_name = index_name class EmbeddingRequest(BaseModel): diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index 34ceae74df..210bc8a3ad 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -4,9 +4,9 @@ import os import time -from typing import List, Optional, Union +from typing import List, Optional, Union, Annotated -from fastapi import Body, File, Form, UploadFile +from fastapi import Body, File, Form, UploadFile, Depends, HTTPException from integrations.elasticsearch import OpeaElasticSearchDataprep from integrations.milvus import OpeaMilvusDataprep from integrations.neo4j_llamaindex import OpeaNeo4jLlamaIndexDataprep @@ -50,7 +50,16 @@ port=5000, ) @register_statistics(names=["opea_service@dataprep"]) -async def ingest_files(input: Union[DataprepRequest, RedisDataprepRequest, Neo4jDataprepRequest]): +async def ingest_files( + base: Annotated[Optional[DataprepRequest], Depends()] = None, + redis: Annotated[Optional[RedisDataprepRequest], Depends()] = None, + neo4j: 
Annotated[Optional[Neo4jDataprepRequest], Depends()] = None, +): + input = redis or neo4j or base + + if input is None: + raise HTTPException(400, detail="Invalid request") + start = time.time() files = input.files From 0c994dc20a2d7402a9d33bbb5155e71ba468f467 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Apr 2025 23:27:51 +0000 Subject: [PATCH 06/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/cores/proto/api_protocol.py | 17 +++++------------ .../dataprep/src/opea_dataprep_microservice.py | 4 ++-- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index 2a61331612..29c3169775 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -6,11 +6,10 @@ from typing import Any, Dict, List, Literal, Optional, Union import shortuuid -from fastapi import File, UploadFile +from fastapi import File, Form, UploadFile from fastapi.responses import JSONResponse from pydantic import BaseModel, Field -from fastapi import File, Form, UploadFile class ServiceCard(BaseModel): object: str = "service" @@ -100,22 +99,16 @@ def __init__( self.process_table = process_table self.table_strategy = table_strategy + class Neo4jDataprepRequest(DataprepRequest): - def __init__( - self, - ingest_from_graphDB: bool = Form(False), - **kwargs - ): + def __init__(self, ingest_from_graphDB: bool = Form(False), **kwargs): kwargs["db_type"] = "neo4j" super().__init__(**kwargs) self.ingest_from_graphDB = ingest_from_graphDB + class RedisDataprepRequest(DataprepRequest): - def __init__( - self, - index_name: Optional[str] = Form(None), - **kwargs - ): + def __init__(self, index_name: Optional[str] = Form(None), **kwargs): kwargs["db_type"] = "redis" super().__init__(**kwargs) self.index_name = index_name diff --git 
a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index 210bc8a3ad..16f736e875 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -4,9 +4,9 @@ import os import time -from typing import List, Optional, Union, Annotated +from typing import Annotated, List, Optional, Union -from fastapi import Body, File, Form, UploadFile, Depends, HTTPException +from fastapi import Body, Depends, File, Form, HTTPException, UploadFile from integrations.elasticsearch import OpeaElasticSearchDataprep from integrations.milvus import OpeaMilvusDataprep from integrations.neo4j_llamaindex import OpeaNeo4jLlamaIndexDataprep From 31cd4637050cb1f47902872b02b2b3e574d72d02 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 7 Apr 2025 19:31:17 -0400 Subject: [PATCH 07/36] fix: verbose `input` processing --- comps/dataprep/src/opea_dataprep_microservice.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index 210bc8a3ad..90d23a2726 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -55,9 +55,16 @@ async def ingest_files( redis: Annotated[Optional[RedisDataprepRequest], Depends()] = None, neo4j: Annotated[Optional[Neo4jDataprepRequest], Depends()] = None, ): - input = redis or neo4j or base - - if input is None: + input = None + if redis is not None: + input = redis + elif neo4j is not None: + input = neo4j + # elif ... 
+ elif base is not None: + input = base + else: + logger.error("Error during dataprep ingest invocation: input is None") raise HTTPException(400, detail="Invalid request") start = time.time() From b22573e712c6a88c89c5fa6a9109ec3ad777d16f Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 8 Apr 2025 08:56:35 -0400 Subject: [PATCH 08/36] attempt: replace `kwargs` with params --- comps/cores/proto/api_protocol.py | 50 +++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index 29c3169775..e3709743ba 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -101,16 +101,54 @@ def __init__( class Neo4jDataprepRequest(DataprepRequest): - def __init__(self, ingest_from_graphDB: bool = Form(False), **kwargs): - kwargs["db_type"] = "neo4j" - super().__init__(**kwargs) + DB_TYPE = "neo4j" + + def __init__(self, + db_type: str = DB_TYPE, + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), + ingest_from_graphDB: bool = Form(False) + ): + super().__init__( + db_type=db_type, + files=files, + link_list=link_list, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + self.ingest_from_graphDB = ingest_from_graphDB class RedisDataprepRequest(DataprepRequest): - def __init__(self, index_name: Optional[str] = Form(None), **kwargs): - kwargs["db_type"] = "redis" - super().__init__(**kwargs) + DB_TYPE = "redis" + + def __init__(self, + db_type: str = DB_TYPE, + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + 
table_strategy: str = Form("fast"), + index_name: Optional[str] = Form(None) + ): + super().__init__( + db_type=db_type, + files=files, + link_list=link_list, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + self.index_name = index_name From fedb099032da5a16151ce84ff2e0c6c358574563 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Apr 2025 13:01:19 +0000 Subject: [PATCH 09/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/cores/proto/api_protocol.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index e3709743ba..82e7723c92 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -103,7 +103,8 @@ def __init__( class Neo4jDataprepRequest(DataprepRequest): DB_TYPE = "neo4j" - def __init__(self, + def __init__( + self, db_type: str = DB_TYPE, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), @@ -111,7 +112,7 @@ def __init__(self, chunk_overlap: int = Form(100), process_table: bool = Form(False), table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False) + ingest_from_graphDB: bool = Form(False), ): super().__init__( db_type=db_type, @@ -129,7 +130,8 @@ def __init__(self, class RedisDataprepRequest(DataprepRequest): DB_TYPE = "redis" - def __init__(self, + def __init__( + self, db_type: str = DB_TYPE, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), @@ -137,7 +139,7 @@ def __init__(self, chunk_overlap: int = Form(100), process_table: bool = Form(False), table_strategy: str = Form("fast"), - index_name: Optional[str] = Form(None) + index_name: Optional[str] = Form(None), ): super().__init__( 
db_type=db_type, From 1b74b90a0f63728080ec2a816a9a5331b15758be Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 8 Apr 2025 11:42:21 -0400 Subject: [PATCH 10/36] rem: `db_type` ref: https://github.com/opea-project/GenAIComps/pull/1525#issuecomment-2785019373 --- comps/cores/proto/api_protocol.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index e963c9bfbc..a88c4e8df6 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -83,7 +83,6 @@ class TokenCheckResponse(BaseModel): class DataprepRequest: def __init__( self, - db_type: str = Form(None), files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), chunk_size: int = Form(1500), @@ -91,7 +90,6 @@ def __init__( process_table: bool = Form(False), table_strategy: str = Form("fast"), ): - self.db_type = db_type self.files = files self.link_list = link_list self.chunk_size = chunk_size @@ -101,11 +99,8 @@ def __init__( class Neo4jDataprepRequest(DataprepRequest): - DB_TYPE = "neo4j" - def __init__( self, - db_type: str = DB_TYPE, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), chunk_size: int = Form(1500), @@ -115,7 +110,6 @@ def __init__( ingest_from_graphDB: bool = Form(False), ): super().__init__( - db_type=db_type, files=files, link_list=link_list, chunk_size=chunk_size, @@ -128,11 +122,8 @@ def __init__( class RedisDataprepRequest(DataprepRequest): - DB_TYPE = "redis" - def __init__( self, - db_type: str = DB_TYPE, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), chunk_size: int = Form(1500), @@ -142,7 +133,6 @@ def __init__( index_name: Optional[str] = Form(None), ): super().__init__( - db_type=db_type, files=files, link_list=link_list, chunk_size=chunk_size, From 620ca6bc1d0b1a23f81f8d86168d65088712cfdf Mon 
Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 8 Apr 2025 13:43:51 -0400 Subject: [PATCH 11/36] attempt: require `base` --- comps/dataprep/src/opea_dataprep_microservice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index 1c574a1419..a1e73a3997 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -51,7 +51,7 @@ ) @register_statistics(names=["opea_service@dataprep"]) async def ingest_files( - base: Annotated[Optional[DataprepRequest], Depends()] = None, + base: Annotated[DataprepRequest, Depends()], redis: Annotated[Optional[RedisDataprepRequest], Depends()] = None, neo4j: Annotated[Optional[Neo4jDataprepRequest], Depends()] = None, ): From 97842bc2dc388d22dd7f45ab9e601e8d7bbf42eb Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 8 Apr 2025 14:36:17 -0400 Subject: [PATCH 12/36] Revert "attempt: require `base`" This reverts commit 620ca6bc1d0b1a23f81f8d86168d65088712cfdf. 
--- comps/dataprep/src/opea_dataprep_microservice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index a1e73a3997..1c574a1419 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -51,7 +51,7 @@ ) @register_statistics(names=["opea_service@dataprep"]) async def ingest_files( - base: Annotated[DataprepRequest, Depends()], + base: Annotated[Optional[DataprepRequest], Depends()] = None, redis: Annotated[Optional[RedisDataprepRequest], Depends()] = None, neo4j: Annotated[Optional[Neo4jDataprepRequest], Depends()] = None, ): From 40d90f36f3448f3aaa6a03a8910d3f420bd3afad Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 4 Apr 2025 18:53:51 -0400 Subject: [PATCH 13/36] new: `DataprepRequest` --- comps/cores/proto/api_protocol.py | 14 +++++++ .../src/integrations/elasticsearch.py | 19 +++++----- comps/dataprep/src/integrations/milvus.py | 17 +++++---- .../src/integrations/milvus_multimodal.py | 4 +- .../src/integrations/neo4j_langchain.py | 17 +++++---- .../src/integrations/neo4j_llamaindex.py | 18 +++++---- comps/dataprep/src/integrations/opensearch.py | 17 +++++---- comps/dataprep/src/integrations/pgvect.py | 13 +++---- comps/dataprep/src/integrations/pipecone.py | 17 +++++---- comps/dataprep/src/integrations/qdrant.py | 17 +++++---- comps/dataprep/src/integrations/redis.py | 19 +++++----- .../src/integrations/redis_finance.py | 13 +++---- .../src/integrations/redis_multimodal.py | 6 ++- comps/dataprep/src/integrations/vdms.py | 17 +++++---- .../src/integrations/vdms_multimodal.py | 3 +- .../src/opea_dataprep_microservice.py | 38 +++---------------- 16 files changed, 123 insertions(+), 126 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index 838a8c8fca..f293fe2c3f 100644 --- a/comps/cores/proto/api_protocol.py +++ 
b/comps/cores/proto/api_protocol.py @@ -79,6 +79,20 @@ class TokenCheckResponseItem(BaseModel): class TokenCheckResponse(BaseModel): prompts: List[TokenCheckResponseItem] +class DataprepRequest(BaseModel): + files: Optional[Union[UploadFile, List[UploadFile]]] = None + link_list: Optional[str] = None + chunk_size: int = 1500 + chunk_overlap: int = 100 + process_table: bool = False + table_strategy: str = "fast" + ingest_from_graphDB: bool = False + +class Neo4jDataprepRequest(DataprepRequest): + ingest_from_graphDB: bool = False + +class RedisDataprepRequest(DataprepRequest): + index_name: Optional[str] = None class EmbeddingRequest(BaseModel): # Ordered by official OpenAI API documentation diff --git a/comps/dataprep/src/integrations/elasticsearch.py b/comps/dataprep/src/integrations/elasticsearch.py index aaa1c9f1e2..e922413268 100644 --- a/comps/dataprep/src/integrations/elasticsearch.py +++ b/comps/dataprep/src/integrations/elasticsearch.py @@ -15,6 +15,7 @@ from langchain_huggingface import HuggingFaceEmbeddings from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -237,16 +238,7 @@ async def ingest_link_to_elastic(self, link_list: List[str]) -> None: if logflag: logger.info(f"Processed batch {i // batch_size + 1}/{(num_chunks - 1) // batch_size + 1}") - async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), - ): + async def ingest_files(self, input: DataprepRequest): """Ingest files/links content into ElasticSearch database. Save in the format of vector[768]. 
@@ -259,6 +251,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/milvus.py b/comps/dataprep/src/integrations/milvus.py index f5957d320b..c8ff0cec8a 100644 --- a/comps/dataprep/src/integrations/milvus.py +++ b/comps/dataprep/src/integrations/milvus.py @@ -17,6 +17,7 @@ from langchain_text_splitters import HTMLHeaderTextSplitter from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -245,14 +246,7 @@ def invoke(self, *args, **kwargs): pass async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest ): """Ingest files/links content into milvus database. @@ -266,6 +260,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"[ milvus ingest ] files:{files}") logger.info(f"[ milvus ingest ] link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/milvus_multimodal.py b/comps/dataprep/src/integrations/milvus_multimodal.py index cd8f40e93a..336c0cebc5 100644 --- a/comps/dataprep/src/integrations/milvus_multimodal.py +++ b/comps/dataprep/src/integrations/milvus_multimodal.py @@ -21,6 +21,7 @@ from PIL import Image from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.third_parties.bridgetower.src.bridgetower_embedding import BridgeTowerEmbedding from .utils.multimodal import ( @@ -591,7 +592,8 @@ async def ingest_generate_captions(self, files: List[UploadFile] = File(None)): raise HTTPException(status_code=400, detail="Must provide at least one file.") - async def ingest_files(self, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None)): + async def ingest_files(self, input: DataprepRequest): + files = input.files if logflag: logger.info(f"[ milvus ingest ] files:{files}") diff --git a/comps/dataprep/src/integrations/neo4j_langchain.py b/comps/dataprep/src/integrations/neo4j_langchain.py index c51dac7996..8b6d411162 100644 --- a/comps/dataprep/src/integrations/neo4j_langchain.py +++ b/comps/dataprep/src/integrations/neo4j_langchain.py @@ -16,6 +16,7 @@ from langchain_text_splitters import HTMLHeaderTextSplitter from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( document_loader, encode_filename, @@ -138,14 +139,7 @@ async def ingest_data_to_neo4j(self, doc_path: DocPath): return True async def 
ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest ): """Ingest files/links content into Neo4j database. @@ -159,6 +153,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/neo4j_llamaindex.py b/comps/dataprep/src/integrations/neo4j_llamaindex.py index 68b7539038..b1ba7848af 100644 --- a/comps/dataprep/src/integrations/neo4j_llamaindex.py +++ b/comps/dataprep/src/integrations/neo4j_llamaindex.py @@ -36,6 +36,7 @@ from transformers import AutoTokenizer from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import Neo4jDataprepRequest from comps.dataprep.src.utils import ( document_loader, encode_filename, @@ -665,14 +666,7 @@ async def build_communities(self, index: PropertyGraphIndex): return False async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: Neo4jDataprepRequest, ): """Ingest files/links content into Neo4j database. 
@@ -687,6 +681,14 @@ async def ingest_files( table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). ingest_from_graphDB (bool, optional): Whether to skip generating graph from files and instead loading index from existing graph store. """ + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + ingest_from_graphDB = input.ingest_from_graphDB + if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/opensearch.py b/comps/dataprep/src/integrations/opensearch.py index b06ced2877..44e3212049 100644 --- a/comps/dataprep/src/integrations/opensearch.py +++ b/comps/dataprep/src/integrations/opensearch.py @@ -14,6 +14,7 @@ from opensearchpy import OpenSearch from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -321,14 +322,7 @@ def search_all_documents(self, index_name, offset, search_batch_size): return None async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest, ): """Ingest files/links content into opensearch database. @@ -342,6 +336,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"[ upload ] files:{files}") logger.info(f"[ upload ] link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/pgvect.py b/comps/dataprep/src/integrations/pgvect.py index 61965427a8..f2484e3ca5 100644 --- a/comps/dataprep/src/integrations/pgvect.py +++ b/comps/dataprep/src/integrations/pgvect.py @@ -15,6 +15,7 @@ from langchain_huggingface import HuggingFaceEmbeddings from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -244,14 +245,7 @@ async def ingest_link_to_pgvector(self, link_list: List[str]): return True async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest, ): """Ingest files/links content into pgvector database. @@ -265,6 +259,9 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" + files = input.files + link_list = input.link_list + if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/pipecone.py b/comps/dataprep/src/integrations/pipecone.py index dcdd16e95e..af843f2fa3 100644 --- a/comps/dataprep/src/integrations/pipecone.py +++ b/comps/dataprep/src/integrations/pipecone.py @@ -15,6 +15,7 @@ from pinecone import Pinecone, ServerlessSpec from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -230,14 +231,7 @@ async def ingest_link_to_pinecone(self, link_list: List[str], chunk_size, chunk_ return True async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest ): """Ingest files/links content into pipecone database. @@ -251,6 +245,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/qdrant.py b/comps/dataprep/src/integrations/qdrant.py index af7c572649..840b8c461e 100644 --- a/comps/dataprep/src/integrations/qdrant.py +++ b/comps/dataprep/src/integrations/qdrant.py @@ -14,6 +14,7 @@ from qdrant_client import QdrantClient from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( document_loader, encode_filename, @@ -152,14 +153,7 @@ async def ingest_data_to_qdrant(self, doc_path: DocPath): return True async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest, ): """Ingest files/links content into qdrant database. @@ -173,6 +167,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"files:{files}") logger.info(f"link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/redis.py b/comps/dataprep/src/integrations/redis.py index 07485ebcef..52d568a2d1 100644 --- a/comps/dataprep/src/integrations/redis.py +++ b/comps/dataprep/src/integrations/redis.py @@ -21,6 +21,7 @@ from redis.commands.search.indexDefinition import IndexDefinition, IndexType from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import RedisDataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -355,15 +356,7 @@ def invoke(self, *args, **kwargs): pass async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), - index_name: str = Form(None), + self, input: RedisDataprepRequest ): """Ingest files/links content into redis database. @@ -378,6 +371,14 @@ async def ingest_files( table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). index_name (str, optional): The name of the index where data will be ingested. 
""" + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + index_name = input.index_name + if logflag: logger.info(f"[ redis ingest ] files:{files}") logger.info(f"[ redis ingest ] link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/redis_finance.py b/comps/dataprep/src/integrations/redis_finance.py index d3fc50c9b1..d3eaa556c7 100644 --- a/comps/dataprep/src/integrations/redis_finance.py +++ b/comps/dataprep/src/integrations/redis_finance.py @@ -15,6 +15,7 @@ from comps import OpeaComponent, OpeaComponentRegistry, ServiceType from comps.dataprep.src.integrations.utils.redis_finance_utils import * from comps.dataprep.src.integrations.utils.redis_kv import RedisKVStore +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import encode_filename, save_content_to_local_disk logflag = os.getenv("LOGFLAG", False) @@ -221,14 +222,7 @@ def invoke(self, *args, **kwargs): pass async def ingest_files( - self, - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest, ): """Ingest files/links content into redis database. @@ -242,6 +236,9 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" + files = input.files + link_list = input.link_list + if logflag: logger.info(f"[ redis ingest ] files:{files}") logger.info(f"[ redis ingest ] link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/redis_multimodal.py b/comps/dataprep/src/integrations/redis_multimodal.py index 713db0bac5..ee8547564f 100644 --- a/comps/dataprep/src/integrations/redis_multimodal.py +++ b/comps/dataprep/src/integrations/redis_multimodal.py @@ -21,6 +21,8 @@ from comps import CustomLogger, OpeaComponent, OpeaComponentRegistry, ServiceType from comps.third_parties.bridgetower.src.bridgetower_embedding import BridgeTowerEmbedding +from comps.cores.proto.api_protocol import DataprepRequest + from .utils.multimodal import ( clear_upload_folder, @@ -651,7 +653,9 @@ async def ingest_generate_captions(self, files: List[UploadFile] = File(None)): raise HTTPException(status_code=400, detail="Must provide at least one file.") - async def ingest_files(self, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None)): + async def ingest_files(self, input: DataprepRequest): + files = input.files + if files: accepted_media_formats = [".mp4", ".png", ".jpg", ".jpeg", ".gif", ".pdf"] # Create a lookup dictionary containing all media files diff --git a/comps/dataprep/src/integrations/vdms.py b/comps/dataprep/src/integrations/vdms.py index 5f389438af..9fb5f67db2 100644 --- a/comps/dataprep/src/integrations/vdms.py +++ b/comps/dataprep/src/integrations/vdms.py @@ -13,6 +13,7 @@ from langchain_vdms.vectorstores import VDMS, VDMS_Client from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -145,14 +146,7 @@ async def ingest_data_to_vdms(self, doc_path: DocPath): logger.info(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") async def ingest_files( - self, - files: 
Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), + self, input: DataprepRequest, ): """Ingest files/links content into VDMS database. @@ -166,6 +160,13 @@ async def ingest_files( process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ + files = input.files + link_list = input.link_list + chunk_size = input.chunk_size + chunk_overlap = input.chunk_overlap + process_table = input.process_table + table_strategy = input.table_strategy + if logflag: logger.info(f"[ upload ] files:{files}") logger.info(f"[ upload ] link_list:{link_list}") diff --git a/comps/dataprep/src/integrations/vdms_multimodal.py b/comps/dataprep/src/integrations/vdms_multimodal.py index a17030dfec..00b72e0eeb 100644 --- a/comps/dataprep/src/integrations/vdms_multimodal.py +++ b/comps/dataprep/src/integrations/vdms_multimodal.py @@ -15,6 +15,7 @@ from tqdm import tqdm from comps import CustomLogger, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from .utils import store_embeddings from .utils.utils import process_all_videos, read_config @@ -194,7 +195,7 @@ async def ingest_generate_transcripts(self, files: List[UploadFile] = File(None) async def ingest_generate_caption(self, files: List[UploadFile] = File(None)): pass - async def ingest_files(self, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None)): + async def ingest_files(self, input: DataprepRequest): pass async def get_files(self): diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index ac5c5443ad..5d1b149432 100644 --- 
a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -28,6 +28,7 @@ statistics_dict, ) from comps.dataprep.src.utils import create_upload_folder +from comps.cores.proto.api_protocol import DataprepRequest, RedisDataprepRequest, Neo4jDataprepRequest logger = CustomLogger("opea_dataprep_microservice") logflag = os.getenv("LOGFLAG", False) @@ -49,45 +50,18 @@ port=5000, ) @register_statistics(names=["opea_service@dataprep"]) -async def ingest_files( - files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), - link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False), - index_name: Optional[str] = Form(None), -): +async def ingest_files(input: Union[DataprepRequest, RedisDataprepRequest, Neo4jDataprepRequest]): start = time.time() + files = input.files + link_list = input.link_list + if logflag: logger.info(f"[ ingest ] files:{files}") logger.info(f"[ ingest ] link_list:{link_list}") try: - # Use the loader to invoke the component - if dataprep_component_name == "OPEA_DATAPREP_REDIS": - response = await loader.ingest_files( - files, - link_list, - chunk_size, - chunk_overlap, - process_table, - table_strategy, - ingest_from_graphDB, - index_name, - ) - else: - if index_name: - logger.error( - 'Error during dataprep ingest invocation: "index_name" option is supported if "DATAPREP_COMPONENT_NAME" environment variable is set to "OPEA_DATAPREP_REDIS". 
i.e: export DATAPREP_COMPONENT_NAME="OPEA_DATAPREP_REDIS"' - ) - raise - - response = await loader.ingest_files( - files, link_list, chunk_size, chunk_overlap, process_table, table_strategy, ingest_from_graphDB - ) + response = await loader.ingest_files(input) # Log the result if logging is enabled if logflag: From fee0afc8baf420df3e017923df750456823cc4ed Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 4 Apr 2025 22:56:10 +0000 Subject: [PATCH 14/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/cores/proto/api_protocol.py | 4 ++++ comps/dataprep/src/integrations/milvus.py | 4 +--- comps/dataprep/src/integrations/neo4j_langchain.py | 4 +--- comps/dataprep/src/integrations/neo4j_llamaindex.py | 3 ++- comps/dataprep/src/integrations/opensearch.py | 3 ++- comps/dataprep/src/integrations/pgvect.py | 3 ++- comps/dataprep/src/integrations/pipecone.py | 4 +--- comps/dataprep/src/integrations/qdrant.py | 3 ++- comps/dataprep/src/integrations/redis.py | 4 +--- comps/dataprep/src/integrations/redis_finance.py | 5 +++-- comps/dataprep/src/integrations/redis_multimodal.py | 3 +-- comps/dataprep/src/integrations/vdms.py | 3 ++- comps/dataprep/src/opea_dataprep_microservice.py | 2 +- 13 files changed, 23 insertions(+), 22 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index f293fe2c3f..db5817b41a 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -79,6 +79,7 @@ class TokenCheckResponseItem(BaseModel): class TokenCheckResponse(BaseModel): prompts: List[TokenCheckResponseItem] + class DataprepRequest(BaseModel): files: Optional[Union[UploadFile, List[UploadFile]]] = None link_list: Optional[str] = None @@ -88,12 +89,15 @@ class DataprepRequest(BaseModel): table_strategy: str = "fast" ingest_from_graphDB: bool = False + class 
Neo4jDataprepRequest(DataprepRequest): ingest_from_graphDB: bool = False + class RedisDataprepRequest(DataprepRequest): index_name: Optional[str] = None + class EmbeddingRequest(BaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/embeddings diff --git a/comps/dataprep/src/integrations/milvus.py b/comps/dataprep/src/integrations/milvus.py index c8ff0cec8a..68080736aa 100644 --- a/comps/dataprep/src/integrations/milvus.py +++ b/comps/dataprep/src/integrations/milvus.py @@ -245,9 +245,7 @@ def check_health(self) -> bool: def invoke(self, *args, **kwargs): pass - async def ingest_files( - self, input: DataprepRequest - ): + async def ingest_files(self, input: DataprepRequest): """Ingest files/links content into milvus database. Save in the format of vector[], the vector length depends on the emedding model type. diff --git a/comps/dataprep/src/integrations/neo4j_langchain.py b/comps/dataprep/src/integrations/neo4j_langchain.py index 8b6d411162..033b4a472e 100644 --- a/comps/dataprep/src/integrations/neo4j_langchain.py +++ b/comps/dataprep/src/integrations/neo4j_langchain.py @@ -138,9 +138,7 @@ async def ingest_data_to_neo4j(self, doc_path: DocPath): return True - async def ingest_files( - self, input: DataprepRequest - ): + async def ingest_files(self, input: DataprepRequest): """Ingest files/links content into Neo4j database. Save in the format of vector[768]. diff --git a/comps/dataprep/src/integrations/neo4j_llamaindex.py b/comps/dataprep/src/integrations/neo4j_llamaindex.py index b1ba7848af..6a9014b4fe 100644 --- a/comps/dataprep/src/integrations/neo4j_llamaindex.py +++ b/comps/dataprep/src/integrations/neo4j_llamaindex.py @@ -666,7 +666,8 @@ async def build_communities(self, index: PropertyGraphIndex): return False async def ingest_files( - self, input: Neo4jDataprepRequest, + self, + input: Neo4jDataprepRequest, ): """Ingest files/links content into Neo4j database. 
diff --git a/comps/dataprep/src/integrations/opensearch.py b/comps/dataprep/src/integrations/opensearch.py index 44e3212049..8aeb66ef44 100644 --- a/comps/dataprep/src/integrations/opensearch.py +++ b/comps/dataprep/src/integrations/opensearch.py @@ -322,7 +322,8 @@ def search_all_documents(self, index_name, offset, search_batch_size): return None async def ingest_files( - self, input: DataprepRequest, + self, + input: DataprepRequest, ): """Ingest files/links content into opensearch database. diff --git a/comps/dataprep/src/integrations/pgvect.py b/comps/dataprep/src/integrations/pgvect.py index f2484e3ca5..276496eeb0 100644 --- a/comps/dataprep/src/integrations/pgvect.py +++ b/comps/dataprep/src/integrations/pgvect.py @@ -245,7 +245,8 @@ async def ingest_link_to_pgvector(self, link_list: List[str]): return True async def ingest_files( - self, input: DataprepRequest, + self, + input: DataprepRequest, ): """Ingest files/links content into pgvector database. diff --git a/comps/dataprep/src/integrations/pipecone.py b/comps/dataprep/src/integrations/pipecone.py index af843f2fa3..3a06845211 100644 --- a/comps/dataprep/src/integrations/pipecone.py +++ b/comps/dataprep/src/integrations/pipecone.py @@ -230,9 +230,7 @@ async def ingest_link_to_pinecone(self, link_list: List[str], chunk_size, chunk_ return True - async def ingest_files( - self, input: DataprepRequest - ): + async def ingest_files(self, input: DataprepRequest): """Ingest files/links content into pipecone database. Save in the format of vector[768]. diff --git a/comps/dataprep/src/integrations/qdrant.py b/comps/dataprep/src/integrations/qdrant.py index 840b8c461e..4b633d86e0 100644 --- a/comps/dataprep/src/integrations/qdrant.py +++ b/comps/dataprep/src/integrations/qdrant.py @@ -153,7 +153,8 @@ async def ingest_data_to_qdrant(self, doc_path: DocPath): return True async def ingest_files( - self, input: DataprepRequest, + self, + input: DataprepRequest, ): """Ingest files/links content into qdrant database. 
diff --git a/comps/dataprep/src/integrations/redis.py b/comps/dataprep/src/integrations/redis.py index 52d568a2d1..ffd233ea95 100644 --- a/comps/dataprep/src/integrations/redis.py +++ b/comps/dataprep/src/integrations/redis.py @@ -355,9 +355,7 @@ async def check_health(self) -> bool: def invoke(self, *args, **kwargs): pass - async def ingest_files( - self, input: RedisDataprepRequest - ): + async def ingest_files(self, input: RedisDataprepRequest): """Ingest files/links content into redis database. Save in the format of vector[768]. diff --git a/comps/dataprep/src/integrations/redis_finance.py b/comps/dataprep/src/integrations/redis_finance.py index d3eaa556c7..04b8286f67 100644 --- a/comps/dataprep/src/integrations/redis_finance.py +++ b/comps/dataprep/src/integrations/redis_finance.py @@ -13,9 +13,9 @@ from langchain_community.vectorstores import Redis from comps import OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.integrations.utils.redis_finance_utils import * from comps.dataprep.src.integrations.utils.redis_kv import RedisKVStore -from comps.cores.proto.api_protocol import DataprepRequest from comps.dataprep.src.utils import encode_filename, save_content_to_local_disk logflag = os.getenv("LOGFLAG", False) @@ -222,7 +222,8 @@ def invoke(self, *args, **kwargs): pass async def ingest_files( - self, input: DataprepRequest, + self, + input: DataprepRequest, ): """Ingest files/links content into redis database. 
diff --git a/comps/dataprep/src/integrations/redis_multimodal.py b/comps/dataprep/src/integrations/redis_multimodal.py index ee8547564f..efa40e2b5e 100644 --- a/comps/dataprep/src/integrations/redis_multimodal.py +++ b/comps/dataprep/src/integrations/redis_multimodal.py @@ -20,9 +20,8 @@ from PIL import Image from comps import CustomLogger, OpeaComponent, OpeaComponentRegistry, ServiceType -from comps.third_parties.bridgetower.src.bridgetower_embedding import BridgeTowerEmbedding from comps.cores.proto.api_protocol import DataprepRequest - +from comps.third_parties.bridgetower.src.bridgetower_embedding import BridgeTowerEmbedding from .utils.multimodal import ( clear_upload_folder, diff --git a/comps/dataprep/src/integrations/vdms.py b/comps/dataprep/src/integrations/vdms.py index 9fb5f67db2..349734605f 100644 --- a/comps/dataprep/src/integrations/vdms.py +++ b/comps/dataprep/src/integrations/vdms.py @@ -146,7 +146,8 @@ async def ingest_data_to_vdms(self, doc_path: DocPath): logger.info(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") async def ingest_files( - self, input: DataprepRequest, + self, + input: DataprepRequest, ): """Ingest files/links content into VDMS database. 
diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index 5d1b149432..34ceae74df 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -27,8 +27,8 @@ register_statistics, statistics_dict, ) +from comps.cores.proto.api_protocol import DataprepRequest, Neo4jDataprepRequest, RedisDataprepRequest from comps.dataprep.src.utils import create_upload_folder -from comps.cores.proto.api_protocol import DataprepRequest, RedisDataprepRequest, Neo4jDataprepRequest logger = CustomLogger("opea_dataprep_microservice") logflag = os.getenv("LOGFLAG", False) From f6978de9dc2e71eafee26dec497492403b59ba4e Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 4 Apr 2025 19:04:20 -0400 Subject: [PATCH 15/36] fix: docstrings --- comps/dataprep/src/integrations/elasticsearch.py | 13 +++++++------ comps/dataprep/src/integrations/milvus.py | 13 +++++++------ .../dataprep/src/integrations/neo4j_langchain.py | 13 +++++++------ .../dataprep/src/integrations/neo4j_llamaindex.py | 15 ++++++++------- comps/dataprep/src/integrations/opensearch.py | 13 +++++++------ comps/dataprep/src/integrations/pgvect.py | 13 +++++++------ comps/dataprep/src/integrations/pipecone.py | 13 +++++++------ comps/dataprep/src/integrations/qdrant.py | 13 +++++++------ comps/dataprep/src/integrations/redis.py | 15 ++++++++------- comps/dataprep/src/integrations/redis_finance.py | 13 +++++++------ comps/dataprep/src/integrations/vdms.py | 13 +++++++------ 11 files changed, 79 insertions(+), 68 deletions(-) diff --git a/comps/dataprep/src/integrations/elasticsearch.py b/comps/dataprep/src/integrations/elasticsearch.py index e922413268..fb9d07e8de 100644 --- a/comps/dataprep/src/integrations/elasticsearch.py +++ b/comps/dataprep/src/integrations/elasticsearch.py @@ -244,12 +244,13 @@ async def ingest_files(self, input: DataprepRequest): Save in the format of vector[768]. 
Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/milvus.py b/comps/dataprep/src/integrations/milvus.py index 68080736aa..a78576a4f1 100644 --- a/comps/dataprep/src/integrations/milvus.py +++ b/comps/dataprep/src/integrations/milvus.py @@ -251,12 +251,13 @@ async def ingest_files(self, input: DataprepRequest): Save in the format of vector[], the vector length depends on the emedding model type. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). 
- link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/neo4j_langchain.py b/comps/dataprep/src/integrations/neo4j_langchain.py index 033b4a472e..9f3b460c3c 100644 --- a/comps/dataprep/src/integrations/neo4j_langchain.py +++ b/comps/dataprep/src/integrations/neo4j_langchain.py @@ -144,12 +144,13 @@ async def ingest_files(self, input: DataprepRequest): Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). 
- process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/neo4j_llamaindex.py b/comps/dataprep/src/integrations/neo4j_llamaindex.py index 6a9014b4fe..26031496d6 100644 --- a/comps/dataprep/src/integrations/neo4j_llamaindex.py +++ b/comps/dataprep/src/integrations/neo4j_llamaindex.py @@ -674,13 +674,14 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
- ingest_from_graphDB (bool, optional): Whether to skip generating graph from files and instead loading index from existing graph store. + input (Neo4jDataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + ingest_from_graphDB (bool, optional): Whether to skip generating graph from files and instead loading index from existing graph store. """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/opensearch.py b/comps/dataprep/src/integrations/opensearch.py index 8aeb66ef44..2b51a5001c 100644 --- a/comps/dataprep/src/integrations/opensearch.py +++ b/comps/dataprep/src/integrations/opensearch.py @@ -330,12 +330,13 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
+ input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/pgvect.py b/comps/dataprep/src/integrations/pgvect.py index 276496eeb0..e1aae31375 100644 --- a/comps/dataprep/src/integrations/pgvect.py +++ b/comps/dataprep/src/integrations/pgvect.py @@ -253,12 +253,13 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). 
+ chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/pipecone.py b/comps/dataprep/src/integrations/pipecone.py index 3a06845211..b142adbc81 100644 --- a/comps/dataprep/src/integrations/pipecone.py +++ b/comps/dataprep/src/integrations/pipecone.py @@ -236,12 +236,13 @@ async def ingest_files(self, input: DataprepRequest): Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). 
+ table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). """ files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/qdrant.py b/comps/dataprep/src/integrations/qdrant.py index 4b633d86e0..386b718b89 100644 --- a/comps/dataprep/src/integrations/qdrant.py +++ b/comps/dataprep/src/integrations/qdrant.py @@ -161,12 +161,13 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/redis.py b/comps/dataprep/src/integrations/redis.py index ffd233ea95..25bb642a47 100644 --- a/comps/dataprep/src/integrations/redis.py +++ b/comps/dataprep/src/integrations/redis.py @@ -361,13 +361,14 @@ async def ingest_files(self, input: RedisDataprepRequest): Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). - index_name (str, optional): The name of the index where data will be ingested. + input (RedisDataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + index_name (str, optional): The name of the index where data will be ingested. 
""" files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/redis_finance.py b/comps/dataprep/src/integrations/redis_finance.py index 04b8286f67..b4770531ff 100644 --- a/comps/dataprep/src/integrations/redis_finance.py +++ b/comps/dataprep/src/integrations/redis_finance.py @@ -230,12 +230,13 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" files = input.files link_list = input.link_list diff --git a/comps/dataprep/src/integrations/vdms.py b/comps/dataprep/src/integrations/vdms.py index 349734605f..fc530bec97 100644 --- a/comps/dataprep/src/integrations/vdms.py +++ b/comps/dataprep/src/integrations/vdms.py @@ -154,12 +154,13 @@ async def ingest_files( Save in the format of vector[768]. Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. Args: - files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). - link_list (str, optional): A list of links to be ingested. Defaults to Form(None). - chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). - chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). - process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). - table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
""" files = input.files link_list = input.link_list From 9e90414f5ccd0ad5d6824dc936bdee720170edcc Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 4 Apr 2025 19:24:40 -0400 Subject: [PATCH 16/36] rem: `ingest_from_graphDB` --- comps/cores/proto/api_protocol.py | 1 - 1 file changed, 1 deletion(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index db5817b41a..a8aebec6b0 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -87,7 +87,6 @@ class DataprepRequest(BaseModel): chunk_overlap: int = 100 process_table: bool = False table_strategy: str = "fast" - ingest_from_graphDB: bool = False class Neo4jDataprepRequest(DataprepRequest): From c43ffb2fff3125dd6ef583aa29cd42d578a5fce4 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 7 Apr 2025 19:27:17 -0400 Subject: [PATCH 17/36] new: dep injection --- comps/cores/proto/api_protocol.py | 46 ++++++++++++++----- .../src/opea_dataprep_microservice.py | 15 ++++-- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index a8aebec6b0..05ac4f82fa 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -10,6 +10,7 @@ from fastapi.responses import JSONResponse from pydantic import BaseModel, Field +from fastapi import File, Form, UploadFile class ServiceCard(BaseModel): object: str = "service" @@ -80,21 +81,44 @@ class TokenCheckResponse(BaseModel): prompts: List[TokenCheckResponseItem] -class DataprepRequest(BaseModel): - files: Optional[Union[UploadFile, List[UploadFile]]] = None - link_list: Optional[str] = None - chunk_size: int = 1500 - chunk_overlap: int = 100 - process_table: bool = False - table_strategy: str = "fast" - +class DataprepRequest: + def __init__( + self, + db_type: str = Form(None), + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = 
Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), + ): + self.db_type = db_type + self.files = files + self.link_list = link_list + self.chunk_size = chunk_size + self.chunk_overlap = chunk_overlap + self.process_table = process_table + self.table_strategy = table_strategy class Neo4jDataprepRequest(DataprepRequest): - ingest_from_graphDB: bool = False - + def __init__( + self, + ingest_from_graphDB: bool = Form(False), + **kwargs + ): + kwargs["db_type"] = "neo4j" + super().__init__(**kwargs) + self.ingest_from_graphDB = ingest_from_graphDB class RedisDataprepRequest(DataprepRequest): - index_name: Optional[str] = None + def __init__( + self, + index_name: Optional[str] = Form(None), + **kwargs + ): + kwargs["db_type"] = "redis" + super().__init__(**kwargs) + self.index_name = index_name class EmbeddingRequest(BaseModel): diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index 34ceae74df..210bc8a3ad 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -4,9 +4,9 @@ import os import time -from typing import List, Optional, Union +from typing import List, Optional, Union, Annotated -from fastapi import Body, File, Form, UploadFile +from fastapi import Body, File, Form, UploadFile, Depends, HTTPException from integrations.elasticsearch import OpeaElasticSearchDataprep from integrations.milvus import OpeaMilvusDataprep from integrations.neo4j_llamaindex import OpeaNeo4jLlamaIndexDataprep @@ -50,7 +50,16 @@ port=5000, ) @register_statistics(names=["opea_service@dataprep"]) -async def ingest_files(input: Union[DataprepRequest, RedisDataprepRequest, Neo4jDataprepRequest]): +async def ingest_files( + base: Annotated[Optional[DataprepRequest], Depends()] = None, + redis: Annotated[Optional[RedisDataprepRequest], Depends()] = None, + neo4j: 
Annotated[Optional[Neo4jDataprepRequest], Depends()] = None, +): + input = redis or neo4j or base + + if input is None: + raise HTTPException(400, detail="Invalid request") + start = time.time() files = input.files From 367fed02df395f83dee6dbdcb4961fcd3c74b594 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Apr 2025 23:27:51 +0000 Subject: [PATCH 18/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/cores/proto/api_protocol.py | 17 +++++------------ .../dataprep/src/opea_dataprep_microservice.py | 4 ++-- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index 05ac4f82fa..257865ee0e 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -6,11 +6,10 @@ from typing import Any, Dict, List, Literal, Optional, Union import shortuuid -from fastapi import File, UploadFile +from fastapi import File, Form, UploadFile from fastapi.responses import JSONResponse from pydantic import BaseModel, Field -from fastapi import File, Form, UploadFile class ServiceCard(BaseModel): object: str = "service" @@ -100,22 +99,16 @@ def __init__( self.process_table = process_table self.table_strategy = table_strategy + class Neo4jDataprepRequest(DataprepRequest): - def __init__( - self, - ingest_from_graphDB: bool = Form(False), - **kwargs - ): + def __init__(self, ingest_from_graphDB: bool = Form(False), **kwargs): kwargs["db_type"] = "neo4j" super().__init__(**kwargs) self.ingest_from_graphDB = ingest_from_graphDB + class RedisDataprepRequest(DataprepRequest): - def __init__( - self, - index_name: Optional[str] = Form(None), - **kwargs - ): + def __init__(self, index_name: Optional[str] = Form(None), **kwargs): kwargs["db_type"] = "redis" super().__init__(**kwargs) self.index_name = index_name diff --git 
a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index 210bc8a3ad..16f736e875 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -4,9 +4,9 @@ import os import time -from typing import List, Optional, Union, Annotated +from typing import Annotated, List, Optional, Union -from fastapi import Body, File, Form, UploadFile, Depends, HTTPException +from fastapi import Body, Depends, File, Form, HTTPException, UploadFile from integrations.elasticsearch import OpeaElasticSearchDataprep from integrations.milvus import OpeaMilvusDataprep from integrations.neo4j_llamaindex import OpeaNeo4jLlamaIndexDataprep From 8b089f6cbcbcecbb4c31eae28b235b4fe033da24 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Mon, 7 Apr 2025 19:31:17 -0400 Subject: [PATCH 19/36] fix: verbose `input` processing --- comps/dataprep/src/opea_dataprep_microservice.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index 16f736e875..1c574a1419 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -55,9 +55,16 @@ async def ingest_files( redis: Annotated[Optional[RedisDataprepRequest], Depends()] = None, neo4j: Annotated[Optional[Neo4jDataprepRequest], Depends()] = None, ): - input = redis or neo4j or base - - if input is None: + input = None + if redis is not None: + input = redis + elif neo4j is not None: + input = neo4j + # elif ... 
+ elif base is not None: + input = base + else: + logger.error("Error during dataprep ingest invocation: input is None") raise HTTPException(400, detail="Invalid request") start = time.time() From 9998f21b1f97cd43633cbf00fb77c7d5b9cda066 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 8 Apr 2025 08:56:35 -0400 Subject: [PATCH 20/36] attempt: replace `kwargs` with params --- comps/cores/proto/api_protocol.py | 50 +++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index 257865ee0e..c49e8934f5 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -101,16 +101,54 @@ def __init__( class Neo4jDataprepRequest(DataprepRequest): - def __init__(self, ingest_from_graphDB: bool = Form(False), **kwargs): - kwargs["db_type"] = "neo4j" - super().__init__(**kwargs) + DB_TYPE = "neo4j" + + def __init__(self, + db_type: str = DB_TYPE, + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), + ingest_from_graphDB: bool = Form(False) + ): + super().__init__( + db_type=db_type, + files=files, + link_list=link_list, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + self.ingest_from_graphDB = ingest_from_graphDB class RedisDataprepRequest(DataprepRequest): - def __init__(self, index_name: Optional[str] = Form(None), **kwargs): - kwargs["db_type"] = "redis" - super().__init__(**kwargs) + DB_TYPE = "redis" + + def __init__(self, + db_type: str = DB_TYPE, + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + 
table_strategy: str = Form("fast"), + index_name: Optional[str] = Form(None) + ): + super().__init__( + db_type=db_type, + files=files, + link_list=link_list, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + self.index_name = index_name From c2c4cb0e4b0186c3e8086518278517a982a5404a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Apr 2025 13:01:19 +0000 Subject: [PATCH 21/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/cores/proto/api_protocol.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index c49e8934f5..e963c9bfbc 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -103,7 +103,8 @@ def __init__( class Neo4jDataprepRequest(DataprepRequest): DB_TYPE = "neo4j" - def __init__(self, + def __init__( + self, db_type: str = DB_TYPE, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), @@ -111,7 +112,7 @@ def __init__(self, chunk_overlap: int = Form(100), process_table: bool = Form(False), table_strategy: str = Form("fast"), - ingest_from_graphDB: bool = Form(False) + ingest_from_graphDB: bool = Form(False), ): super().__init__( db_type=db_type, @@ -129,7 +130,8 @@ def __init__(self, class RedisDataprepRequest(DataprepRequest): DB_TYPE = "redis" - def __init__(self, + def __init__( + self, db_type: str = DB_TYPE, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), @@ -137,7 +139,7 @@ def __init__(self, chunk_overlap: int = Form(100), process_table: bool = Form(False), table_strategy: str = Form("fast"), - index_name: Optional[str] = Form(None) + index_name: Optional[str] = Form(None), ): super().__init__( 
db_type=db_type, From 02cd3e9b0df4d61df8a82a82f50e3a189ece3d32 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 8 Apr 2025 11:42:21 -0400 Subject: [PATCH 22/36] rem: `db_type` ref: https://github.com/opea-project/GenAIComps/pull/1525#issuecomment-2785019373 --- comps/cores/proto/api_protocol.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index e963c9bfbc..a88c4e8df6 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -83,7 +83,6 @@ class TokenCheckResponse(BaseModel): class DataprepRequest: def __init__( self, - db_type: str = Form(None), files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), chunk_size: int = Form(1500), @@ -91,7 +90,6 @@ def __init__( process_table: bool = Form(False), table_strategy: str = Form("fast"), ): - self.db_type = db_type self.files = files self.link_list = link_list self.chunk_size = chunk_size @@ -101,11 +99,8 @@ def __init__( class Neo4jDataprepRequest(DataprepRequest): - DB_TYPE = "neo4j" - def __init__( self, - db_type: str = DB_TYPE, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), chunk_size: int = Form(1500), @@ -115,7 +110,6 @@ def __init__( ingest_from_graphDB: bool = Form(False), ): super().__init__( - db_type=db_type, files=files, link_list=link_list, chunk_size=chunk_size, @@ -128,11 +122,8 @@ def __init__( class RedisDataprepRequest(DataprepRequest): - DB_TYPE = "redis" - def __init__( self, - db_type: str = DB_TYPE, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), chunk_size: int = Form(1500), @@ -142,7 +133,6 @@ def __init__( index_name: Optional[str] = Form(None), ): super().__init__( - db_type=db_type, files=files, link_list=link_list, chunk_size=chunk_size, From 1192ae7a008d0c83ee16d576c9fc9fba4c85b2c9 Mon 
Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 8 Apr 2025 13:43:51 -0400 Subject: [PATCH 23/36] attempt: require `base` --- comps/dataprep/src/opea_dataprep_microservice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index 1c574a1419..a1e73a3997 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -51,7 +51,7 @@ ) @register_statistics(names=["opea_service@dataprep"]) async def ingest_files( - base: Annotated[Optional[DataprepRequest], Depends()] = None, + base: Annotated[DataprepRequest, Depends()], redis: Annotated[Optional[RedisDataprepRequest], Depends()] = None, neo4j: Annotated[Optional[Neo4jDataprepRequest], Depends()] = None, ): From 49090e37a25a46f6ab115b03c88996bcac22877a Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Tue, 8 Apr 2025 14:36:17 -0400 Subject: [PATCH 24/36] Revert "attempt: require `base`" This reverts commit 620ca6bc1d0b1a23f81f8d86168d65088712cfdf. 
--- comps/dataprep/src/opea_dataprep_microservice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index a1e73a3997..1c574a1419 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -51,7 +51,7 @@ ) @register_statistics(names=["opea_service@dataprep"]) async def ingest_files( - base: Annotated[DataprepRequest, Depends()], + base: Annotated[Optional[DataprepRequest], Depends()] = None, redis: Annotated[Optional[RedisDataprepRequest], Depends()] = None, neo4j: Annotated[Optional[Neo4jDataprepRequest], Depends()] = None, ): From 390b316190316cab058dd624cb2ec6044a16bbbc Mon Sep 17 00:00:00 2001 From: Letong Han <106566639+letonghan@users.noreply.github.com> Date: Sat, 12 Apr 2025 03:30:42 +0800 Subject: [PATCH 25/36] Fix dataprep request class issue of Redis (#1) * new: `DataprepRequest` * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix: docstrings * rem: `ingest_from_graphDB` * new: dep injection * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix: verbose `input` processing * attempt: replace `kwargs` with params * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * rem: `db_type` ref: https://github.com/opea-project/GenAIComps/pull/1525#issuecomment-2785019373 * attempt: require `base` * Revert "attempt: require `base`" This reverts commit 620ca6bc1d0b1a23f81f8d86168d65088712cfdf. 
* fix dataprep request class of redis Signed-off-by: letonghan * revert change in redis.py Signed-off-by: letonghan --------- Signed-off-by: letonghan Co-authored-by: Anthony Mahanna Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Anthony Mahanna <43019056+aMahanna@users.noreply.github.com> Co-authored-by: Liang Lv --- comps/cores/proto/api_protocol.py | 28 ++++---- .../src/opea_dataprep_microservice.py | 47 +++++++++---- tests/dataprep/dataprep_utils.sh | 70 ++++++++++++------- tests/dataprep/test_dataprep_redis.sh | 17 ++--- 4 files changed, 99 insertions(+), 63 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index a88c4e8df6..6aa35ed9eb 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -80,15 +80,15 @@ class TokenCheckResponse(BaseModel): prompts: List[TokenCheckResponseItem] -class DataprepRequest: +class DataprepRequest(): def __init__( self, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), + chunk_size: Optional[int] = Form(1500), + chunk_overlap: Optional[int] = Form(100), + process_table: Optional[bool] = Form(False), + table_strategy: Optional[str] = Form("fast"), ): self.files = files self.link_list = link_list @@ -103,10 +103,10 @@ def __init__( self, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), + chunk_size: Optional[int] = Form(1500), + chunk_overlap: Optional[int] = Form(100), + process_table: Optional[bool] = Form(False), + table_strategy: Optional[str] = Form("fast"), ingest_from_graphDB: bool = Form(False), ): 
super().__init__( @@ -126,11 +126,11 @@ def __init__( self, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None), - chunk_size: int = Form(1500), - chunk_overlap: int = Form(100), - process_table: bool = Form(False), - table_strategy: str = Form("fast"), - index_name: Optional[str] = Form(None), + chunk_size: Optional[int] = Form(1500), + chunk_overlap: Optional[int] = Form(100), + process_table: Optional[bool] = Form(False), + table_strategy: Optional[str] = Form("fast"), + index_name: str = Form(None), ): super().__init__( files=files, diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index 1c574a1419..81ea8540b3 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -4,6 +4,7 @@ import os import time +from fastapi import Request from typing import Annotated, List, Optional, Union from fastapi import Body, Depends, File, Form, HTTPException, UploadFile @@ -42,6 +43,33 @@ ) +async def resolve_dataprep_request(request: Request): + form = await request.form() + + common_args = { + "files": form.get("files", None), + "link_list": form.get("link_list", None), + "chunk_size": form.get("chunk_size", 1500), + "chunk_overlap": form.get("chunk_overlap", 100), + "process_table": form.get("process_table", False), + "table_strategy": form.get("table_strategy", "fast"), + } + + if "index_name" in form: + return RedisDataprepRequest( + **common_args, + index_name=form.get("index_name"), + ) + + if "ingest_from_graphDB" in form: + return Neo4jDataprepRequest( + **common_args, + ingest_from_graphDB=form.get("ingest_from_graphDB"), + ) + + return DataprepRequest(**common_args) + + @register_microservice( name="opea_service@dataprep", service_type=ServiceType.DATAPREP, @@ -51,21 +79,14 @@ ) @register_statistics(names=["opea_service@dataprep"]) async def ingest_files( - base: 
Annotated[Optional[DataprepRequest], Depends()] = None, - redis: Annotated[Optional[RedisDataprepRequest], Depends()] = None, - neo4j: Annotated[Optional[Neo4jDataprepRequest], Depends()] = None, + input: Union[DataprepRequest, RedisDataprepRequest, Neo4jDataprepRequest] = Depends(resolve_dataprep_request), ): - input = None - if redis is not None: - input = redis - elif neo4j is not None: - input = neo4j - # elif ... - elif base is not None: - input = base + if isinstance(input, RedisDataprepRequest): + logger.info(f"[ ingest ] Redis mode: index_name={input.index_name}") + elif isinstance(input, Neo4jDataprepRequest): + logger.info(f"[ ingest ] Neo4j mode: ingest_from_graphDB={input.ingest_from_graphDB}") else: - logger.error("Error during dataprep ingest invocation: input is None") - raise HTTPException(400, detail="Invalid request") + logger.info(f"[ ingest ] Base mode") start = time.time() diff --git a/tests/dataprep/dataprep_utils.sh b/tests/dataprep/dataprep_utils.sh index 6416c37c65..cdd8253894 100644 --- a/tests/dataprep/dataprep_utils.sh +++ b/tests/dataprep/dataprep_utils.sh @@ -39,61 +39,85 @@ function _invoke_curl() { RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') } +# +function _add_db_params() { + local db=$1 + if [[ "$db" == "redis" ]]; then + echo '-F index_name=test' + fi +} + # validate_ingest function ingest_doc() { local fqdn=$1 local port=$2 - shift 2 - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.doc" $@ + local db=$3 + shift 3 + local extra_args=$(_add_db_params "$db") + _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.doc" $extra_args $@ } function ingest_docx() { local fqdn=$1 local port=$2 - shift 2 - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.docx" $@ + local db=$3 + shift 3 + local extra_args=$(_add_db_params "$db") + _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.docx" $extra_args $@ } function ingest_pdf() { 
local fqdn=$1 local port=$2 - shift 2 - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.pdf" $@ + local db=$3 + shift 3 + local extra_args=$(_add_db_params "$db") + _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.pdf" $extra_args $@ } function ingest_ppt() { local fqdn=$1 local port=$2 - shift 2 - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.ppt" $@ + local db=$3 + shift 3 + local extra_args=$(_add_db_params "$db") + _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.ppt" $extra_args $@ } function ingest_pptx() { local fqdn=$1 local port=$2 - shift 2 - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.pptx" $@ + local db=$3 + shift 3 + local extra_args=$(_add_db_params "$db") + _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.pptx" $extra_args $@ } function ingest_txt() { local fqdn=$1 local port=$2 - shift 2 - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.txt" $@ + local db=$3 + shift 3 + local extra_args=$(_add_db_params "$db") + _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.txt" $extra_args $@ } function ingest_xlsx() { local fqdn=$1 local port=$2 - shift 2 - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.xlsx" $@ + local db=$3 + shift 3 + local extra_args=$(_add_db_params "$db") + _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.xlsx" $extra_args $@ } function ingest_external_link() { local fqdn=$1 local port=$2 - shift 2 - _invoke_curl $fqdn $port ingest -F 'link_list=["https://www.ces.tech/"]' $@ + local db=$3 + shift 3 + local extra_args=$(_add_db_params "$db") + _invoke_curl $fqdn $port ingest -F 'link_list=["https://www.ces.tech/"]' $extra_args $@ } function delete_all() { @@ -106,7 +130,7 @@ function delete_all() { function delete_single() { local fqdn=$1 local port=$2 - shift 3 + shift 2 _invoke_curl $fqdn 
$port delete -d '{"file_path":"ingest_dataprep.txt"}' $@ } @@ -117,18 +141,12 @@ function get_all() { _invoke_curl $fqdn $port get $@ } -function ingest_txt_with_index_name() { - local fqdn=$1 - local port=$2 - local index_name=$3 - shift 3 - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.txt" -F "index_name=${index_name}" $@ -} - function indices() { local fqdn=$1 local port=$2 - shift 2 + local db=$3 + shift 3 + local extra_args=$(_add_db_params "$db") _invoke_curl $fqdn $port indices $@ } diff --git a/tests/dataprep/test_dataprep_redis.sh b/tests/dataprep/test_dataprep_redis.sh index cfc053e1d5..926ce09683 100644 --- a/tests/dataprep/test_dataprep_redis.sh +++ b/tests/dataprep/test_dataprep_redis.sh @@ -51,34 +51,31 @@ function validate_microservice() { check_result "dataprep - del" '{"status":true}' dataprep-redis-server ${LOG_PATH}/dataprep_del.log # test /v1/dataprep/ingest upload file - ingest_doc ${ip_address} ${DATAPREP_PORT} + ingest_doc ${ip_address} ${DATAPREP_PORT} "redis" check_result "dataprep - upload - doc" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log - ingest_docx ${ip_address} ${DATAPREP_PORT} + ingest_docx ${ip_address} ${DATAPREP_PORT} "redis" check_result "dataprep - upload - docx" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log - ingest_pdf ${ip_address} ${DATAPREP_PORT} + ingest_pdf ${ip_address} ${DATAPREP_PORT} "redis" check_result "dataprep - upload - pdf" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log - ingest_ppt ${ip_address} ${DATAPREP_PORT} + ingest_ppt ${ip_address} ${DATAPREP_PORT} "redis" check_result "dataprep - upload - ppt" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log - ingest_pptx ${ip_address} ${DATAPREP_PORT} + ingest_pptx ${ip_address} ${DATAPREP_PORT} "redis" check_result "dataprep - upload - pptx" "Data preparation succeeded" 
dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log - ingest_txt ${ip_address} ${DATAPREP_PORT} + ingest_txt ${ip_address} ${DATAPREP_PORT} "redis" check_result "dataprep - upload - txt" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log - ingest_xlsx ${ip_address} ${DATAPREP_PORT} + ingest_xlsx ${ip_address} ${DATAPREP_PORT} "redis" check_result "dataprep - upload - xlsx" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log # test /v1/dataprep/ingest upload link ingest_external_link ${ip_address} ${DATAPREP_PORT} check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log - ingest_txt_with_index_name ${ip_address} ${DATAPREP_PORT} rag_redis_test - check_result "dataprep - upload with index - txt" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log - # test /v1/dataprep/indices indices ${ip_address} ${DATAPREP_PORT} check_result "dataprep - indices" "['rag_redis_test']" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log From 34e0eddb77e85ea248dd9df7479432ab9e948365 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 11 Apr 2025 19:31:14 +0000 Subject: [PATCH 26/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/cores/proto/api_protocol.py | 2 +- comps/dataprep/src/opea_dataprep_microservice.py | 7 +++---- tests/dataprep/dataprep_utils.sh | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index 6aa35ed9eb..f8f14e0dca 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -80,7 +80,7 @@ class TokenCheckResponse(BaseModel): prompts: List[TokenCheckResponseItem] -class DataprepRequest(): +class DataprepRequest: def __init__( self, 
files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index 81ea8540b3..a398c9e1ff 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -4,10 +4,9 @@ import os import time -from fastapi import Request from typing import Annotated, List, Optional, Union -from fastapi import Body, Depends, File, Form, HTTPException, UploadFile +from fastapi import Body, Depends, File, Form, HTTPException, Request, UploadFile from integrations.elasticsearch import OpeaElasticSearchDataprep from integrations.milvus import OpeaMilvusDataprep from integrations.neo4j_llamaindex import OpeaNeo4jLlamaIndexDataprep @@ -45,7 +44,7 @@ async def resolve_dataprep_request(request: Request): form = await request.form() - + common_args = { "files": form.get("files", None), "link_list": form.get("link_list", None), @@ -86,7 +85,7 @@ async def ingest_files( elif isinstance(input, Neo4jDataprepRequest): logger.info(f"[ ingest ] Neo4j mode: ingest_from_graphDB={input.ingest_from_graphDB}") else: - logger.info(f"[ ingest ] Base mode") + logger.info("[ ingest ] Base mode") start = time.time() diff --git a/tests/dataprep/dataprep_utils.sh b/tests/dataprep/dataprep_utils.sh index cdd8253894..25094feff6 100644 --- a/tests/dataprep/dataprep_utils.sh +++ b/tests/dataprep/dataprep_utils.sh @@ -39,7 +39,7 @@ function _invoke_curl() { RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') } -# +# function _add_db_params() { local db=$1 if [[ "$db" == "redis" ]]; then From ae842f1ced340f1d47183e40999b808c841962e0 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 11 Apr 2025 15:56:01 -0400 Subject: [PATCH 27/36] revert: `DataprepRequest` for multimodal --- comps/dataprep/src/integrations/milvus_multimodal.py | 3 +-- comps/dataprep/src/integrations/redis_multimodal.py | 3 +-- 2 files changed, 2 
insertions(+), 4 deletions(-) diff --git a/comps/dataprep/src/integrations/milvus_multimodal.py b/comps/dataprep/src/integrations/milvus_multimodal.py index 336c0cebc5..be04fce8ab 100644 --- a/comps/dataprep/src/integrations/milvus_multimodal.py +++ b/comps/dataprep/src/integrations/milvus_multimodal.py @@ -592,8 +592,7 @@ async def ingest_generate_captions(self, files: List[UploadFile] = File(None)): raise HTTPException(status_code=400, detail="Must provide at least one file.") - async def ingest_files(self, input: DataprepRequest): - files = input.files + async def ingest_files(self, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None)): if logflag: logger.info(f"[ milvus ingest ] files:{files}") diff --git a/comps/dataprep/src/integrations/redis_multimodal.py b/comps/dataprep/src/integrations/redis_multimodal.py index efa40e2b5e..3e3c308b32 100644 --- a/comps/dataprep/src/integrations/redis_multimodal.py +++ b/comps/dataprep/src/integrations/redis_multimodal.py @@ -652,8 +652,7 @@ async def ingest_generate_captions(self, files: List[UploadFile] = File(None)): raise HTTPException(status_code=400, detail="Must provide at least one file.") - async def ingest_files(self, input: DataprepRequest): - files = input.files + async def ingest_files(self, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None)): if files: accepted_media_formats = [".mp4", ".png", ".jpg", ".jpeg", ".gif", ".pdf"] From a0e8f160f1e05ef536a7ac06b62d06e3eb97fb0c Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 11 Apr 2025 15:58:50 -0400 Subject: [PATCH 28/36] revert: `DataprepRequest` for multimodal (PT2) --- comps/dataprep/src/integrations/milvus_multimodal.py | 1 - comps/dataprep/src/integrations/redis_multimodal.py | 2 -- comps/dataprep/src/integrations/vdms_multimodal.py | 3 +-- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/comps/dataprep/src/integrations/milvus_multimodal.py b/comps/dataprep/src/integrations/milvus_multimodal.py index 
be04fce8ab..cd8f40e93a 100644 --- a/comps/dataprep/src/integrations/milvus_multimodal.py +++ b/comps/dataprep/src/integrations/milvus_multimodal.py @@ -21,7 +21,6 @@ from PIL import Image from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType -from comps.cores.proto.api_protocol import DataprepRequest from comps.third_parties.bridgetower.src.bridgetower_embedding import BridgeTowerEmbedding from .utils.multimodal import ( diff --git a/comps/dataprep/src/integrations/redis_multimodal.py b/comps/dataprep/src/integrations/redis_multimodal.py index 3e3c308b32..713db0bac5 100644 --- a/comps/dataprep/src/integrations/redis_multimodal.py +++ b/comps/dataprep/src/integrations/redis_multimodal.py @@ -20,7 +20,6 @@ from PIL import Image from comps import CustomLogger, OpeaComponent, OpeaComponentRegistry, ServiceType -from comps.cores.proto.api_protocol import DataprepRequest from comps.third_parties.bridgetower.src.bridgetower_embedding import BridgeTowerEmbedding from .utils.multimodal import ( @@ -653,7 +652,6 @@ async def ingest_generate_captions(self, files: List[UploadFile] = File(None)): raise HTTPException(status_code=400, detail="Must provide at least one file.") async def ingest_files(self, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None)): - if files: accepted_media_formats = [".mp4", ".png", ".jpg", ".jpeg", ".gif", ".pdf"] # Create a lookup dictionary containing all media files diff --git a/comps/dataprep/src/integrations/vdms_multimodal.py b/comps/dataprep/src/integrations/vdms_multimodal.py index 00b72e0eeb..a17030dfec 100644 --- a/comps/dataprep/src/integrations/vdms_multimodal.py +++ b/comps/dataprep/src/integrations/vdms_multimodal.py @@ -15,7 +15,6 @@ from tqdm import tqdm from comps import CustomLogger, OpeaComponent, OpeaComponentRegistry, ServiceType -from comps.cores.proto.api_protocol import DataprepRequest from .utils import store_embeddings from .utils.utils import process_all_videos, read_config 
@@ -195,7 +194,7 @@ async def ingest_generate_transcripts(self, files: List[UploadFile] = File(None) async def ingest_generate_caption(self, files: List[UploadFile] = File(None)): pass - async def ingest_files(self, input: DataprepRequest): + async def ingest_files(self, files: Optional[Union[UploadFile, List[UploadFile]]] = File(None)): pass async def get_files(self): From 4a912d85494527fb5a302d8acf88a2788587ffc0 Mon Sep 17 00:00:00 2001 From: Anthony Mahanna Date: Fri, 11 Apr 2025 16:19:11 -0400 Subject: [PATCH 29/36] fix: conditionally fetch unique `DataprepRequest` attributes --- comps/dataprep/src/integrations/neo4j_llamaindex.py | 12 ++++++------ comps/dataprep/src/integrations/redis.py | 9 ++++++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/comps/dataprep/src/integrations/neo4j_llamaindex.py b/comps/dataprep/src/integrations/neo4j_llamaindex.py index 26031496d6..a12c930597 100644 --- a/comps/dataprep/src/integrations/neo4j_llamaindex.py +++ b/comps/dataprep/src/integrations/neo4j_llamaindex.py @@ -36,7 +36,7 @@ from transformers import AutoTokenizer from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType -from comps.cores.proto.api_protocol import Neo4jDataprepRequest +from comps.cores.proto.api_protocol import DataprepRequest, Neo4jDataprepRequest from comps.dataprep.src.utils import ( document_loader, encode_filename, @@ -665,10 +665,7 @@ async def build_communities(self, index: PropertyGraphIndex): logger.error(f"Error building communities: {e}\n{error_trace}") return False - async def ingest_files( - self, - input: Neo4jDataprepRequest, - ): + async def ingest_files(self, input: Union[DataprepRequest, Neo4jDataprepRequest]): """Ingest files/links content into Neo4j database. Save in the format of vector[768]. 
@@ -689,7 +686,10 @@ async def ingest_files( chunk_overlap = input.chunk_overlap process_table = input.process_table table_strategy = input.table_strategy - ingest_from_graphDB = input.ingest_from_graphDB + + ingest_from_graphDB = False + if isinstance(input, Neo4jDataprepRequest): + ingest_from_graphDB = input.ingest_from_graphDB if logflag: logger.info(f"files:{files}") diff --git a/comps/dataprep/src/integrations/redis.py b/comps/dataprep/src/integrations/redis.py index 25bb642a47..8f85d46564 100644 --- a/comps/dataprep/src/integrations/redis.py +++ b/comps/dataprep/src/integrations/redis.py @@ -21,7 +21,7 @@ from redis.commands.search.indexDefinition import IndexDefinition, IndexType from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType -from comps.cores.proto.api_protocol import RedisDataprepRequest +from comps.cores.proto.api_protocol import DataprepRequest, RedisDataprepRequest from comps.dataprep.src.utils import ( create_upload_folder, document_loader, @@ -355,7 +355,7 @@ async def check_health(self) -> bool: def invoke(self, *args, **kwargs): pass - async def ingest_files(self, input: RedisDataprepRequest): + async def ingest_files(self, input: Union[DataprepRequest, RedisDataprepRequest]): """Ingest files/links content into redis database. Save in the format of vector[768]. 
@@ -376,7 +376,10 @@ async def ingest_files(self, input: RedisDataprepRequest): chunk_overlap = input.chunk_overlap process_table = input.process_table table_strategy = input.table_strategy - index_name = input.index_name + + index_name = None + if isinstance(input, RedisDataprepRequest): + index_name = input.index_name if logflag: logger.info(f"[ redis ingest ] files:{files}") From a9028c3e3f533528b4c849bb21ebfefa7f311269 Mon Sep 17 00:00:00 2001 From: letonghan Date: Thu, 17 Apr 2025 16:18:35 +0800 Subject: [PATCH 30/36] fix bugs in dataprep util script Signed-off-by: letonghan --- tests/dataprep/dataprep_utils.sh | 72 +++++++++--------------- tests/dataprep/test_dataprep_milvus.sh | 4 ++ tests/dataprep/test_dataprep_pgvector.sh | 4 ++ 3 files changed, 35 insertions(+), 45 deletions(-) diff --git a/tests/dataprep/dataprep_utils.sh b/tests/dataprep/dataprep_utils.sh index 9fa91b85df..fa7e953919 100644 --- a/tests/dataprep/dataprep_utils.sh +++ b/tests/dataprep/dataprep_utils.sh @@ -47,75 +47,57 @@ function _add_db_params() { fi } -# validate_ingest -function ingest_doc() { - local fqdn=$1 - local port=$2 - local db=$3 - shift 3 +function ingest_file() { + local fqdn="$1" + local port="$2" + local db_or_filename="$3" + local filename="$4" + + if [[ "$filename" == "" ]]; then + filename="$db_or_filename" + db="" + shift 3 + else + db="$db_or_filename" + shift 4 + fi + local extra_args=$(_add_db_params "$db") - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.doc" $extra_args $@ + _invoke_curl "$fqdn" "$port" ingest -F "files=@${SCRIPT_DIR}/${filename}" $extra_args "$@" +} + +function ingest_doc() { + ingest_file "$1" "$2" "$3" "ingest_dataprep.doc" "${@:5}" } function ingest_docx() { - local fqdn=$1 - local port=$2 - local db=$3 - shift 3 - local extra_args=$(_add_db_params "$db") - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.docx" $extra_args $@ + ingest_file "$1" "$2" "$3" "ingest_dataprep.docx" "${@:5}" }
function ingest_pdf() { - local fqdn=$1 - local port=$2 - local db=$3 - shift 3 - local extra_args=$(_add_db_params "$db") - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.pdf" $extra_args $@ + ingest_file "$1" "$2" "$3" "ingest_dataprep.pdf" "${@:5}" } function ingest_ppt() { - local fqdn=$1 - local port=$2 - local db=$3 - shift 3 - local extra_args=$(_add_db_params "$db") - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.ppt" $extra_args $@ + ingest_file "$1" "$2" "$3" "ingest_dataprep.ppt" "${@:5}" } function ingest_pptx() { - local fqdn=$1 - local port=$2 - local db=$3 - shift 3 - local extra_args=$(_add_db_params "$db") - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.pptx" $extra_args $@ + ingest_file "$1" "$2" "$3" "ingest_dataprep.pptx" "${@:5}" } function ingest_txt() { - local fqdn=$1 - local port=$2 - local db=$3 - shift 3 - local extra_args=$(_add_db_params "$db") - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.txt" $extra_args $@ + ingest_file "$1" "$2" "$3" "ingest_dataprep.txt" "${@:5}" } function ingest_xlsx() { - local fqdn=$1 - local port=$2 - local db=$3 - shift 3 - local extra_args=$(_add_db_params "$db") - _invoke_curl $fqdn $port ingest -F "files=@${SCRIPT_DIR}/ingest_dataprep.xlsx" $extra_args $@ + ingest_file "$1" "$2" "$3" "ingest_dataprep.xlsx" "${@:5}" } function ingest_external_link() { local fqdn=$1 local port=$2 - local db=$3 - shift 3 + shift 2 local extra_args=$(_add_db_params "$db") _invoke_curl $fqdn $port ingest -F 'link_list=["https://www.ces.tech/"]' $extra_args $@ } diff --git a/tests/dataprep/test_dataprep_milvus.sh b/tests/dataprep/test_dataprep_milvus.sh index 809506e766..839b5c31e0 100644 --- a/tests/dataprep/test_dataprep_milvus.sh +++ b/tests/dataprep/test_dataprep_milvus.sh @@ -42,6 +42,10 @@ function start_service() { } function validate_microservice() { + # test /v1/dataprep/delete + delete_all ${ip_address} 
${DATAPREP_PORT} + check_result "dataprep - del" '{"status":true}' dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log + # test /v1/dataprep/ingest upload file ingest_doc ${ip_address} ${DATAPREP_PORT} check_result "dataprep - upload - doc" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log diff --git a/tests/dataprep/test_dataprep_pgvector.sh b/tests/dataprep/test_dataprep_pgvector.sh index 554c55b6f6..4359f3c0c9 100644 --- a/tests/dataprep/test_dataprep_pgvector.sh +++ b/tests/dataprep/test_dataprep_pgvector.sh @@ -46,6 +46,10 @@ function start_service() { } function validate_microservice() { + # test /v1/dataprep/delete + delete_all ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - del" '{"status":true}' dataprep-pgvector-server ${LOG_PATH}/dataprep_pgvector.log + # test /v1/dataprep/ingest upload file ingest_doc ${ip_address} ${DATAPREP_PORT} check_result "dataprep - upload - doc" "Data preparation succeeded" dataprep-pgvector-server ${LOG_PATH}/dataprep_pgvector.log From e87c14ec3b267aa52f82fc17ffdcf62cbb0e1f9e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 17 Apr 2025 08:19:17 +0000 Subject: [PATCH 31/36] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/dataprep/test_dataprep_milvus.sh | 2 +- tests/dataprep/test_dataprep_pgvector.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dataprep/test_dataprep_milvus.sh b/tests/dataprep/test_dataprep_milvus.sh index 839b5c31e0..958e44ddf6 100644 --- a/tests/dataprep/test_dataprep_milvus.sh +++ b/tests/dataprep/test_dataprep_milvus.sh @@ -45,7 +45,7 @@ function validate_microservice() { # test /v1/dataprep/delete delete_all ${ip_address} ${DATAPREP_PORT} check_result "dataprep - del" '{"status":true}' dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log - + # test /v1/dataprep/ingest upload file ingest_doc 
${ip_address} ${DATAPREP_PORT} check_result "dataprep - upload - doc" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log diff --git a/tests/dataprep/test_dataprep_pgvector.sh b/tests/dataprep/test_dataprep_pgvector.sh index 4359f3c0c9..e5d8fab606 100644 --- a/tests/dataprep/test_dataprep_pgvector.sh +++ b/tests/dataprep/test_dataprep_pgvector.sh @@ -49,7 +49,7 @@ function validate_microservice() { # test /v1/dataprep/delete delete_all ${ip_address} ${DATAPREP_PORT} check_result "dataprep - del" '{"status":true}' dataprep-pgvector-server ${LOG_PATH}/dataprep_pgvector.log - + # test /v1/dataprep/ingest upload file ingest_doc ${ip_address} ${DATAPREP_PORT} check_result "dataprep - upload - doc" "Data preparation succeeded" dataprep-pgvector-server ${LOG_PATH}/dataprep_pgvector.log From 4961a094af7df4666e44933520f559b41a4d01f1 Mon Sep 17 00:00:00 2001 From: letonghan Date: Thu, 17 Apr 2025 17:12:35 +0800 Subject: [PATCH 32/36] revert change of pgvector Signed-off-by: letonghan --- tests/dataprep/test_dataprep_pgvector.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/dataprep/test_dataprep_pgvector.sh b/tests/dataprep/test_dataprep_pgvector.sh index e5d8fab606..554c55b6f6 100644 --- a/tests/dataprep/test_dataprep_pgvector.sh +++ b/tests/dataprep/test_dataprep_pgvector.sh @@ -46,10 +46,6 @@ function start_service() { } function validate_microservice() { - # test /v1/dataprep/delete - delete_all ${ip_address} ${DATAPREP_PORT} - check_result "dataprep - del" '{"status":true}' dataprep-pgvector-server ${LOG_PATH}/dataprep_pgvector.log - # test /v1/dataprep/ingest upload file ingest_doc ${ip_address} ${DATAPREP_PORT} check_result "dataprep - upload - doc" "Data preparation succeeded" dataprep-pgvector-server ${LOG_PATH}/dataprep_pgvector.log From 0dbd8c0f0655152ac95ba4b34555d0b30a7c6b37 Mon Sep 17 00:00:00 2001 From: letonghan Date: Thu, 17 Apr 2025 20:42:43 +0800 Subject: [PATCH 33/36] fix indices bug for redis Signed-off-by: 
letonghan --- tests/dataprep/dataprep_utils.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/dataprep/dataprep_utils.sh b/tests/dataprep/dataprep_utils.sh index fa7e953919..918cfa03dd 100644 --- a/tests/dataprep/dataprep_utils.sh +++ b/tests/dataprep/dataprep_utils.sh @@ -166,9 +166,7 @@ function ingest_txt_with_index_name() { function indices() { local fqdn=$1 local port=$2 - local db=$3 - shift 3 - local extra_args=$(_add_db_params "$db") + shift 2 _invoke_curl $fqdn $port indices $@ } From d364988e76124cb3b7c57eaffc76684956126a25 Mon Sep 17 00:00:00 2001 From: letonghan Date: Thu, 17 Apr 2025 21:32:46 +0800 Subject: [PATCH 34/36] minor fix for redis Signed-off-by: letonghan --- tests/dataprep/dataprep_utils.sh | 2 +- tests/dataprep/test_dataprep_redis.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dataprep/dataprep_utils.sh b/tests/dataprep/dataprep_utils.sh index 918cfa03dd..c3d86e1feb 100644 --- a/tests/dataprep/dataprep_utils.sh +++ b/tests/dataprep/dataprep_utils.sh @@ -43,7 +43,7 @@ function _invoke_curl() { function _add_db_params() { local db=$1 if [[ "$db" == "redis" ]]; then - echo '-F index_name=test' + echo '-F index_name=rag_redis' fi } diff --git a/tests/dataprep/test_dataprep_redis.sh b/tests/dataprep/test_dataprep_redis.sh index 47fbd073e4..816daecb97 100644 --- a/tests/dataprep/test_dataprep_redis.sh +++ b/tests/dataprep/test_dataprep_redis.sh @@ -79,7 +79,7 @@ function validate_microservice() { # test /v1/dataprep/indices indices ${ip_address} ${DATAPREP_PORT} - check_result "dataprep - indices" "['rag_redis_test']" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log + check_result "dataprep - indices" "['rag_redis']" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log # test /v1/dataprep/get get_all ${ip_address} ${DATAPREP_PORT} From 7104c1b806a58f9d7c9ebea732ea18b1d3a2d7f2 Mon Sep 17 00:00:00 2001 From: letonghan Date: Thu, 17 Apr 2025 22:11:54 +0800 Subject: [PATCH 
35/36] ingest file into rag_redis_test Signed-off-by: letonghan --- tests/dataprep/test_dataprep_redis.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/dataprep/test_dataprep_redis.sh b/tests/dataprep/test_dataprep_redis.sh index 816daecb97..9e8105d625 100644 --- a/tests/dataprep/test_dataprep_redis.sh +++ b/tests/dataprep/test_dataprep_redis.sh @@ -77,6 +77,9 @@ function validate_microservice() { ingest_external_link ${ip_address} ${DATAPREP_PORT} check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log + ingest_txt_with_index_name ${ip_address} ${DATAPREP_PORT} rag_redis_test + check_result "dataprep - upload with index - txt" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log + # test /v1/dataprep/indices indices ${ip_address} ${DATAPREP_PORT} check_result "dataprep - indices" "['rag_redis']" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log From 91229375792f7baedf82598ee4da46a8eb201e56 Mon Sep 17 00:00:00 2001 From: letonghan Date: Thu, 17 Apr 2025 22:13:57 +0800 Subject: [PATCH 36/36] update indice name Signed-off-by: letonghan --- tests/dataprep/test_dataprep_redis.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dataprep/test_dataprep_redis.sh b/tests/dataprep/test_dataprep_redis.sh index 9e8105d625..e07961d7eb 100644 --- a/tests/dataprep/test_dataprep_redis.sh +++ b/tests/dataprep/test_dataprep_redis.sh @@ -82,7 +82,7 @@ function validate_microservice() { # test /v1/dataprep/indices indices ${ip_address} ${DATAPREP_PORT} - check_result "dataprep - indices" "['rag_redis']" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log + check_result "dataprep - indices" "['rag_redis_test']" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log # test /v1/dataprep/get get_all ${ip_address} ${DATAPREP_PORT}