opea-project · rbrugaro · Apr 8, 2025 · Mar 28, 2025 · Mar 28, 2025 · Mar 31, 2025
@@ -20,7 +20,7 @@ services:
     image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
     container_name: dataprep-elasticsearch
     ports:
-      - "${DATAPREP_PORT:-11100}:5000"
+      - "${DATAPREP_PORT:-5000}:5000"
     ipc: host
     environment:
       no_proxy: ${no_proxy}
@@ -40,7 +40,7 @@ services:
     image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
     container_name: dataprep-milvus-server
     ports:
-      - "${DATAPREP_PORT:-11101}:5000"
+      - "${DATAPREP_PORT:-5000}:5000"
     ipc: host
     environment:
       no_proxy: ${no_proxy}
@@ -66,7 +66,7 @@ services:
     image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
     container_name: dataprep-multimodal-milvus-server
     ports:
-      - "${DATAPREP_PORT:-11102}:5000"
+      - "${DATAPREP_PORT:-5000}:5000"
     depends_on:
       standalone:
         condition: service_healthy
@@ -97,7 +97,7 @@ services:
       tei-embedding-serving:
         condition: service_healthy
     ports:
-      - "${DATAPREP_PORT:-11103}:5000"
+      - "${DATAPREP_PORT:-5000}:5000"
     ipc: host
     environment:
       no_proxy: ${no_proxy}
@@ -125,7 +125,7 @@ services:
     image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
     container_name: dataprep-opensearch-server
     ports:
-      - "${DATAPREP_PORT:-11104}:5000"
+      - "${DATAPREP_PORT:-5000}:5000"
     depends_on:
       opensearch-vector-db:
         condition: service_healthy
@@ -147,7 +147,7 @@ services:
     image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
     container_name: dataprep-pgvector-server
     ports:
-      - "${DATAPREP_PORT:-11105}:5000"
+      - "${DATAPREP_PORT:-5000}:5000"
     depends_on:
       pgvector-db:
         condition: service_healthy
@@ -164,7 +164,7 @@ services:
     image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
     container_name: dataprep-pinecone-server
     ports:
-      - "${DATAPREP_PORT:-11106}:5000"
+      - "${DATAPREP_PORT:-5000}:5000"
     ipc: host
     environment:
       no_proxy: ${no_proxy}
@@ -185,7 +185,7 @@ services:
       tei-embedding-serving:
         condition: service_healthy
     ports:
-      - "${DATAPREP_PORT:-11107}:5000"
+      - "${DATAPREP_PORT:-5000}:5000"
     ipc: host
     environment:
       no_proxy: ${no_proxy}
@@ -208,7 +208,7 @@ services:
       tei-embedding-serving:
         condition: service_healthy
     ports:
-      - "${DATAPREP_PORT:-11108}:5000"
+      - "${DATAPREP_PORT:-5000}:5000"
     ipc: host
     environment:
       no_proxy: ${no_proxy}
@@ -227,7 +227,7 @@ services:
     image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
     container_name: dataprep-multimodal-redis-server
     ports:
-      - "${DATAPREP_PORT:-11109}:5000"
+      - "${DATAPREP_PORT:-5000}:5000"
     depends_on:
       redis-vector-db:
         condition: service_healthy
@@ -250,7 +250,7 @@ services:
     image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
     container_name: dataprep-vdms-server
     ports:
-      - "${DATAPREP_PORT:-11110}:5000"
+      - "${DATAPREP_PORT:-5000}:5000"
     depends_on:
       vdms-vector-db:
         condition: service_healthy
@@ -270,7 +270,7 @@ services:
     image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
     container_name: dataprep-vdms-multimodal-server
     ports:
-      - "${DATAPREP_PORT:-11111}:5000"
+      - "${DATAPREP_PORT:-5000}:5000"
     depends_on:
       vdms-vector-db:
         condition: service_healthy
@@ -297,7 +297,7 @@ services:
       tei-embedding-serving:
         condition: service_healthy
     ports:
-      - "${DATAPREP_PORT:-11108}:5000"
+      - "${DATAPREP_PORT:-5000}:5000"
     environment:
       no_proxy: ${no_proxy}
       http_proxy: ${http_proxy}

@@ -11,7 +11,16 @@ This microservice follows the graphRAG approached defined by Microsoft paper ["F
 
 This dataprep microservice ingests the input files and uses an LLM (TGI, vLLM, or an OpenAI model when OPENAI_API_KEY is set) to extract entities, relationships, and descriptions of those to build a graph-based text index. The Compose yaml file deploys TGI, but the microservice also works with a vLLM inference endpoint.
 
-## Setup Environment Variables
+## 🚀Start Microservice with Docker
+
+### 1. Build Docker Image
+
+```bash
+cd ../../../../
+docker build -t opea/dataprep:latest --build-arg no_proxy=$no_proxy --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/src/Dockerfile .
+```
+
+### 2. Setup Environment Variables
 
 ```bash
 # Manually set private environment settings
@@ -34,46 +43,20 @@ export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-8B-Instruct"
 export MAX_INPUT_TOKENS=4096
 export MAX_TOTAL_TOKENS=8192
 export OPENAI_LLM_MODEL="gpt-4o"
-export TEI_EMBEDDER_PORT=11633
+export TEI_EMBEDDER_PORT=8090
 export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:${TEI_EMBEDDER_PORT}"
-export LLM_ENDPOINT_PORT=11634
+export LLM_ENDPOINT_PORT=8008
 export TGI_LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
 export NEO4J_AUTH="${NEO4J_USERNAME}/${NEO4J_PASSWORD}"
-export NEO4J_PORT1=7474   # 11631
-export NEO4J_PORT2=7687   # 11632
+export NEO4J_PORT1=7474
+export NEO4J_PORT2=7687
 export NEO4J_URI="bolt://${host_ip}:${NEO4J_PORT2}"
 export NEO4J_URL="bolt://${host_ip}:${NEO4J_PORT2}"
-export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6004/v1/dataprep"
+export DATAPREP_PORT=5000
+export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_PORT}/v1/dataprep"
 export LOGFLAG=True
 ```
 
-## 🚀Start Microservice with Docker
-
-### 1. Build Docker Image
-
-```bash
-cd ../../../../
-docker build -t opea/dataprep-neo4j-llamaindex:latest --build-arg no_proxy=$no_proxy --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/src/Dockerfile .
-```
-
-### 2. Setup Environment Variables
-
-```bash
-# Set private environment settings
-export host_ip=${your_hostname IP}  # local IP
-export no_proxy=$no_proxy,${host_ip}  # important to add {host_ip} for containers communication
-export http_proxy=${your_http_proxy}
-export https_proxy=${your_http_proxy}
-export NEO4J_URI=${your_neo4j_url}
-export NEO4J_USERNAME=${your_neo4j_username}
-export NEO4J_PASSWORD=${your_neo4j_password}
-export PYTHONPATH=${path_to_comps}
-export OPENAI_KEY=${your_openai_api_key}  # optional, when not provided will use smaller models TGI/TEI
-export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
-# set additional environment settings
-source ./set_env.sh
-```
-
 ### 3. Run Docker with Docker Compose
 
 Docker compose will start 4 microservices: dataprep-neo4j-llamaindex, neo4j-apoc, tgi-gaudi-service and tei-embedding-service. TGI and TEI are needed because dataprep relies on an LLM to extract entities and relationships from text to build the graph and the Neo4j Property Graph Index. The Neo4j database supports embeddings natively, so we do not need a separate vector store. Check out the blog [Introducing the Property Graph Index: A Powerful New Way to Build Knowledge Graphs with LLMs](https://www.llamaindex.ai/blog/introducing-the-property-graph-index-a-powerful-new-way-to-build-knowledge-graphs-with-llms) for a better understanding of the Property Graph Store and Index.
@@ -91,7 +74,7 @@ Once document preparation microservice for Neo4J is started, user can use below
 curl -X POST \
     -H "Content-Type: multipart/form-data" \
     -F "files=@./file1.txt" \
-    http://${host_ip}:6004/v1/dataprep/ingest
+    http://${host_ip}:${DATAPREP_PORT}/v1/dataprep/ingest
 ```
 
 You can specify chunk_size and chunk_overlap with the following commands.
@@ -102,7 +85,7 @@ curl -X POST \
     -F "files=@./file1.txt" \
     -F "chunk_size=1500" \
     -F "chunk_overlap=100" \
-    http://${host_ip}:6004/v1/dataprep/ingest
+    http://${host_ip}:${DATAPREP_PORT}/v1/dataprep/ingest
 ```
 
 Please note that clustering of extracted entities and summarization happens in this data preparation step. The result of this is:
@@ -122,5 +105,5 @@ curl -X POST \
     -F "files=@./your_file.pdf" \
     -F "process_table=true" \
     -F "table_strategy=hq" \
-    http://localhost:6004/v1/dataprep/ingest
+    http://localhost:${DATAPREP_PORT}/v1/dataprep/ingest
 ```
@@ -60,7 +60,7 @@
 from llama_index.core.schema import BaseNode, TransformComponent
 
 host_ip = os.getenv("host_ip")
-NEO4J_PORT2 = os.getenv("NEO4J_PORT2")
+NEO4J_PORT2 = os.getenv("NEO4J_PORT2", "7687")
 # Neo4J configuration
 NEO4J_URL = os.getenv("NEO4J_URL", f"bolt://{host_ip}:{NEO4J_PORT2}")
 NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")

@@ -17,25 +17,6 @@ cd ../../../
 docker build -t opea/retriever:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/src/Dockerfile .
 ```
 
-### 2. Install Requirements
-
-```bash
-pip install -r requirements.txt
-```
-
-### 3. Start Neo4j VectorDB Service
-
-```bash
-docker run \
-    -p 7474:7474 -p 7687:7687 \
-    -v $PWD/data:/data -v $PWD/plugins:/plugins \
-    --name neo4j-apoc \
-    -d \
-    -e NEO4J_AUTH=neo4j/password \
-    -e NEO4J_PLUGINS=\[\"apoc\"\]  \
-    neo4j:latest
-```
-
 ### 2. Setup Environment Variables
 
 ```bash
@@ -58,18 +39,17 @@ export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-8B-Instruct"
 export MAX_INPUT_TOKENS=4096
 export MAX_TOTAL_TOKENS=8192
 export OPENAI_LLM_MODEL="gpt-4o"
-export TEI_EMBEDDER_PORT=11633
+export TEI_EMBEDDER_PORT=8090
 export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:${TEI_EMBEDDER_PORT}"
-export LLM_ENDPOINT_PORT=11634
+export LLM_ENDPOINT_PORT=8008
 export TGI_LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
-export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6006"
-export TGI_LLM_ENDPOINT="http://${host_ip}:6005"
-export NEO4J_PORT1=7474   # 11631
-export NEO4J_PORT2=7687   # 11632
+export NEO4J_PORT1=7474
+export NEO4J_PORT2=7687
 export NEO4J_URI="bolt://${host_ip}:${NEO4J_PORT2}"
 export NEO4J_URL="bolt://${host_ip}:${NEO4J_PORT2}"
-export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6004/v1/dataprep"
-export RETRIEVER_PORT=11635
+export DATAPREP_PORT=5000
+export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_PORT}/v1/dataprep"
+export RETRIEVER_PORT=7000
 export LOGFLAG=True
 ```
 
@@ -88,7 +68,7 @@ docker compose -f compose.yaml up ${service_name} -d
 ### 3.1 Check Service Status
 
 ```bash
-curl http://${host_ip}:7000/v1/health_check \
+curl http://${host_ip}:${RETRIEVER_PORT}/v1/health_check \
   -X GET \
   -H 'Content-Type: application/json'
 ```
@@ -98,7 +78,7 @@ curl http://${host_ip}:7000/v1/health_check \
 If OPEN_AI_KEY is provided, it will use OpenAI endpoints for the LLM and embeddings; otherwise it will use the TGI and TEI endpoints. If a model name is not provided in the request, it will use the default specified by the set_env.sh script.
 
 ```bash
-curl -X POST http://${host_ip}:7000/v1/retrieval \
+curl -X POST http://${host_ip}:${RETRIEVER_PORT}/v1/retrieval \
   -H "Content-Type: application/json" \
   -d '{"model": "gpt-3.5-turbo","messages": [{"role": "user","content": "Who is John Brady and has he had any confrontations?"}]}'
 ```
@@ -63,7 +63,7 @@ def get_boolean_env_var(var_name, default_value=False):
 #######################################################
 #                    Neo4j                            #
 #######################################################
-NEO4J_PORT2 = os.getenv("NEO4J_PORT2", "11632")
+NEO4J_PORT2 = os.getenv("NEO4J_PORT2", "7687")
 NEO4J_URL = os.getenv("NEO4J_URI", f"bolt://localhost:{NEO4J_PORT2}")
 NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
 NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "test")

@@ -7,7 +7,7 @@ set -x
 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
 ip_address=$(hostname -I | awk '{print $1}')
-DATAPREP_PORT=11100
+export DATAPREP_PORT=11100
 export TAG="comps"
 
 function build_docker_images() {

@@ -7,7 +7,7 @@ set -x
 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
 ip_address=$(hostname -I | awk '{print $1}')
-DATAPREP_PORT=11101
+export DATAPREP_PORT=11101
 service_name="dataprep-milvus tei-embedding-serving etcd minio standalone"
 export TAG="comps"
 export DATA_PATH=${model_cache}

@@ -22,7 +22,7 @@ image_fn="${tmp_dir}/${image_name}.png"
 caption_fn="${tmp_dir}/${image_name}.txt"
 pdf_name="nke-10k-2023"
 pdf_fn="${tmp_dir}/${pdf_name}.pdf"
-DATAPREP_PORT="11102"
+export DATAPREP_PORT="11102"
 
 function build_docker_images() {
     cd $WORKPATH

@@ -7,7 +7,7 @@ set -x
 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
 ip_address=$(hostname -I | awk '{print $1}')
-DATAPREP_PORT=11103
+export DATAPREP_PORT=11103
 LLM_ENDPOINT_PORT=10510
 export TAG="comps"
 export DATA_PATH=${model_cache}
@@ -29,8 +29,8 @@ function build_docker_images() {
 function start_service() {
     service_name="neo4j-apoc tei-embedding-serving tgi-gaudi-server dataprep-neo4j-llamaindex"
     export host_ip=${ip_address}
-    export NEO4J_PORT1=7474   # 11631
-    export NEO4J_PORT2=7687   # 11632
+    export NEO4J_PORT1=11631
+    export NEO4J_PORT2=11632
     export NEO4J_AUTH="neo4j/neo4jtest"
     export NEO4J_URL="bolt://${ip_address}:${NEO4J_PORT2}"
     export NEO4J_USERNAME="neo4j"

@@ -7,7 +7,7 @@ set -x
 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
 ip_address=$(hostname -I | awk '{print $1}')
-DATAPREP_PORT="11104"
+export DATAPREP_PORT="11104"
 OPENSEARCH_INITIAL_ADMIN_PASSWORD="StRoNgOpEa0)"
 export TAG="comps"
 

@@ -7,7 +7,7 @@ set -x
 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
 ip_address=$(hostname -I | awk '{print $1}')
-DATAPREP_PORT="11105"
+export DATAPREP_PORT="11105"
 export TAG="comps"
 
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

@@ -6,7 +6,7 @@ set -x
 
 WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')
-DATAPREP_PORT="11106"
+export DATAPREP_PORT="11106"
 export TAG="comps"
 
 function build_docker_images() {

@@ -7,7 +7,7 @@ set -x
 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
 ip_address=$(hostname -I | awk '{print $1}')
-DATAPREP_PORT="11107"
+export DATAPREP_PORT="11107"
 TEI_EMBEDDER_PORT="10220"
 export TAG="comps"
 export DATA_PATH=${model_cache}

@@ -7,7 +7,7 @@ set -x
 WORKPATH=$(dirname "$PWD")
 LOG_PATH="$WORKPATH/tests"
 ip_address=$(hostname -I | awk '{print $1}')
-DATAPREP_PORT="11108"
+export DATAPREP_PORT="11108"
 TEI_EMBEDDER_PORT="10221"
 export TAG="comps"
 export DATA_PATH=${model_cache}