Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 13 additions & 13 deletions comps/dataprep/deployment/docker_compose/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ services:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-elasticsearch
ports:
- "${DATAPREP_PORT:-11100}:5000"
- "${DATAPREP_PORT:-5000}:5000"
ipc: host
environment:
no_proxy: ${no_proxy}
Expand All @@ -40,7 +40,7 @@ services:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-milvus-server
ports:
- "${DATAPREP_PORT:-11101}:5000"
- "${DATAPREP_PORT:-5000}:5000"
ipc: host
environment:
no_proxy: ${no_proxy}
Expand All @@ -66,7 +66,7 @@ services:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-multimodal-milvus-server
ports:
- "${DATAPREP_PORT:-11102}:5000"
- "${DATAPREP_PORT:-5000}:5000"
depends_on:
standalone:
condition: service_healthy
Expand Down Expand Up @@ -97,7 +97,7 @@ services:
tei-embedding-serving:
condition: service_healthy
ports:
- "${DATAPREP_PORT:-11103}:5000"
- "${DATAPREP_PORT:-5000}:5000"
ipc: host
environment:
no_proxy: ${no_proxy}
Expand Down Expand Up @@ -125,7 +125,7 @@ services:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-opensearch-server
ports:
- "${DATAPREP_PORT:-11104}:5000"
- "${DATAPREP_PORT:-5000}:5000"
depends_on:
opensearch-vector-db:
condition: service_healthy
Expand All @@ -147,7 +147,7 @@ services:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-pgvector-server
ports:
- "${DATAPREP_PORT:-11105}:5000"
- "${DATAPREP_PORT:-5000}:5000"
depends_on:
pgvector-db:
condition: service_healthy
Expand All @@ -164,7 +164,7 @@ services:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-pinecone-server
ports:
- "${DATAPREP_PORT:-11106}:5000"
- "${DATAPREP_PORT:-5000}:5000"
ipc: host
environment:
no_proxy: ${no_proxy}
Expand All @@ -185,7 +185,7 @@ services:
tei-embedding-serving:
condition: service_healthy
ports:
- "${DATAPREP_PORT:-11107}:5000"
- "${DATAPREP_PORT:-5000}:5000"
ipc: host
environment:
no_proxy: ${no_proxy}
Expand All @@ -208,7 +208,7 @@ services:
tei-embedding-serving:
condition: service_healthy
ports:
- "${DATAPREP_PORT:-11108}:5000"
- "${DATAPREP_PORT:-5000}:5000"
ipc: host
environment:
no_proxy: ${no_proxy}
Expand All @@ -227,7 +227,7 @@ services:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-multimodal-redis-server
ports:
- "${DATAPREP_PORT:-11109}:5000"
- "${DATAPREP_PORT:-5000}:5000"
depends_on:
redis-vector-db:
condition: service_healthy
Expand All @@ -250,7 +250,7 @@ services:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-vdms-server
ports:
- "${DATAPREP_PORT:-11110}:5000"
- "${DATAPREP_PORT:-5000}:5000"
depends_on:
vdms-vector-db:
condition: service_healthy
Expand All @@ -270,7 +270,7 @@ services:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-vdms-multimodal-server
ports:
- "${DATAPREP_PORT:-11111}:5000"
- "${DATAPREP_PORT:-5000}:5000"
depends_on:
vdms-vector-db:
condition: service_healthy
Expand All @@ -297,7 +297,7 @@ services:
tei-embedding-serving:
condition: service_healthy
ports:
- "${DATAPREP_PORT:-11108}:5000"
- "${DATAPREP_PORT:-5000}:5000"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
Expand Down
55 changes: 19 additions & 36 deletions comps/dataprep/src/README_neo4j_llamaindex.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,16 @@ This microservice follows the graphRAG approached defined by Microsoft paper ["F

This dataprep microservice ingests the input files and uses an LLM (TGI, vLLM, or an OpenAI model when OPENAI_API_KEY is set) to extract entities, relationships, and their descriptions in order to build a graph-based text index. The Compose YAML file deploys TGI but also works with a vLLM inference endpoint.

## Setup Environment Variables
## 🚀Start Microservice with Docker

### 1. Build Docker Image

```bash
cd ../../../../
docker build -t opea/dataprep:latest --build-arg no_proxy=$no_proxy --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/src/Dockerfile .
```

### 2. Setup Environment Variables

```bash
# Manually set private environment settings
Expand All @@ -34,46 +43,20 @@ export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-8B-Instruct"
export MAX_INPUT_TOKENS=4096
export MAX_TOTAL_TOKENS=8192
export OPENAI_LLM_MODEL="gpt-4o"
export TEI_EMBEDDER_PORT=11633
export TEI_EMBEDDER_PORT=8090
export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:${TEI_EMBEDDER_PORT}"
export LLM_ENDPOINT_PORT=11634
export LLM_ENDPOINT_PORT=8008
export TGI_LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export NEO4J_AUTH="${NEO4J_USERNAME}/${NEO4J_PASSWORD}"
export NEO4J_PORT1=7474 # 11631
export NEO4J_PORT2=7687 # 11632
export NEO4J_PORT1=7474
export NEO4J_PORT2=7687
export NEO4J_URI="bolt://${host_ip}:${NEO4J_PORT2}"
export NEO4J_URL="bolt://${host_ip}:${NEO4J_PORT2}"
export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6004/v1/dataprep"
export DATAPREP_PORT=5000
export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_PORT}/v1/dataprep"
export LOGFLAG=True
```

## 🚀Start Microservice with Docker

### 1. Build Docker Image

```bash
cd ../../../../
docker build -t opea/dataprep-neo4j-llamaindex:latest --build-arg no_proxy=$no_proxy --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/src/Dockerfile .
```

### 2. Setup Environment Variables

```bash
# Set private environment settings
export host_ip=${your_hostname IP} # local IP
export no_proxy=$no_proxy,${host_ip} # important to add {host_ip} for containers communication
export http_proxy=${your_http_proxy}
export https_proxy=${your_http_proxy}
export NEO4J_URI=${your_neo4j_url}
export NEO4J_USERNAME=${your_neo4j_username}
export NEO4J_PASSWORD=${your_neo4j_password}
export PYTHONPATH=${path_to_comps}
export OPENAI_KEY=${your_openai_api_key} # optional, when not provided will use smaller models TGI/TEI
export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
# set additional environment settings
source ./set_env.sh
```

### 3. Run Docker with Docker Compose

Docker Compose will start 4 microservices: dataprep-neo4j-llamaindex, neo4j-apoc, tgi-gaudi-service and tei-embedding-service. TGI and TEI are needed because dataprep relies on an LLM to extract entities and relationships from text to build the graph and the Neo4j Property Graph Index. The Neo4j database supports embeddings natively, so we do not need a separate vector store. Check out the blog [Introducing the Property Graph Index: A Powerful New Way to Build Knowledge Graphs with LLMs](https://www.llamaindex.ai/blog/introducing-the-property-graph-index-a-powerful-new-way-to-build-knowledge-graphs-with-llms) for a better understanding of the Property Graph Store and Index.
Expand All @@ -91,7 +74,7 @@ Once document preparation microservice for Neo4J is started, user can use below
curl -X POST \
-H "Content-Type: multipart/form-data" \
-F "files=@./file1.txt" \
http://${host_ip}:6004/v1/dataprep/ingest
http://${host_ip}:${DATAPREP_PORT}/v1/dataprep/ingest
```

You can specify chunk_size and chunk_overlap with the following commands.
Expand All @@ -102,7 +85,7 @@ curl -X POST \
-F "files=@./file1.txt" \
-F "chunk_size=1500" \
-F "chunk_overlap=100" \
http://${host_ip}:6004/v1/dataprep/ingest
http://${host_ip}:${DATAPREP_PORT}/v1/dataprep/ingest
```

Please note that clustering of extracted entities and summarization both happen in this data preparation step. The result of this is:
Expand All @@ -122,5 +105,5 @@ curl -X POST \
-F "files=@./your_file.pdf" \
-F "process_table=true" \
-F "table_strategy=hq" \
http://localhost:6004/v1/dataprep/ingest
http://localhost:${DATAPREP_PORT}/v1/dataprep/ingest
```
2 changes: 1 addition & 1 deletion comps/dataprep/src/integrations/neo4j_llamaindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
from llama_index.core.schema import BaseNode, TransformComponent

host_ip = os.getenv("host_ip")
NEO4J_PORT2 = os.getenv("NEO4J_PORT2")
NEO4J_PORT2 = os.getenv("NEO4J_PORT2", "7687")
# Neo4J configuration
NEO4J_URL = os.getenv("NEO4J_URL", f"bolt://{host_ip}:{NEO4J_PORT2}")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
Expand Down
38 changes: 9 additions & 29 deletions comps/retrievers/src/README_neo4j.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,6 @@ cd ../../../
docker build -t opea/retriever:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/src/Dockerfile .
```

### 2. Install Requirements

```bash
pip install -r requirements.txt
```

### 3. Start Neo4j VectorDB Service

```bash
docker run \
-p 7474:7474 -p 7687:7687 \
-v $PWD/data:/data -v $PWD/plugins:/plugins \
--name neo4j-apoc \
-d \
-e NEO4J_AUTH=neo4j/password \
-e NEO4J_PLUGINS=\[\"apoc\"\] \
neo4j:latest
```

### 2. Setup Environment Variables

```bash
Expand All @@ -58,18 +39,17 @@ export LLM_MODEL_ID="meta-llama/Meta-Llama-3.1-8B-Instruct"
export MAX_INPUT_TOKENS=4096
export MAX_TOTAL_TOKENS=8192
export OPENAI_LLM_MODEL="gpt-4o"
export TEI_EMBEDDER_PORT=11633
export TEI_EMBEDDER_PORT=8090
export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:${TEI_EMBEDDER_PORT}"
export LLM_ENDPOINT_PORT=11634
export LLM_ENDPOINT_PORT=8008
export TGI_LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6006"
export TGI_LLM_ENDPOINT="http://${host_ip}:6005"
export NEO4J_PORT1=7474 # 11631
export NEO4J_PORT2=7687 # 11632
export NEO4J_PORT1=7474
export NEO4J_PORT2=7687
export NEO4J_URI="bolt://${host_ip}:${NEO4J_PORT2}"
export NEO4J_URL="bolt://${host_ip}:${NEO4J_PORT2}"
export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6004/v1/dataprep"
export RETRIEVER_PORT=11635
export DATAPREP_PORT=5000
export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:${DATAPREP_PORT}/v1/dataprep"
export RETRIEVER_PORT=7000
export LOGFLAG=True
```

Expand All @@ -88,7 +68,7 @@ docker compose -f compose.yaml up ${service_name} -d
### 3.1 Check Service Status

```bash
curl http://${host_ip}:7000/v1/health_check \
curl http://${host_ip}:${RETRIEVER_PORT}/v1/health_check \
-X GET \
-H 'Content-Type: application/json'
```
Expand All @@ -98,7 +78,7 @@ curl http://${host_ip}:7000/v1/health_check \
If OPEN_AI_KEY is provided, it will use OpenAI endpoints for the LLM and embeddings; otherwise, it will use the TGI and TEI endpoints. If a model name is not provided in the request, it will use the default specified by the set_env.sh script.

```bash
curl -X POST http://${host_ip}:7000/v1/retrieval \
curl -X POST http://${host_ip}:${RETRIEVER_PORT}/v1/retrieval \
-H "Content-Type: application/json" \
-d '{"model": "gpt-3.5-turbo","messages": [{"role": "user","content": "Who is John Brady and has he had any confrontations?"}]}'
```
2 changes: 1 addition & 1 deletion comps/retrievers/src/integrations/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def get_boolean_env_var(var_name, default_value=False):
#######################################################
# Neo4j #
#######################################################
NEO4J_PORT2 = os.getenv("NEO4J_PORT2", "11632")
NEO4J_PORT2 = os.getenv("NEO4J_PORT2", "7687")
NEO4J_URL = os.getenv("NEO4J_URI", f"bolt://localhost:{NEO4J_PORT2}")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "test")
Expand Down
2 changes: 1 addition & 1 deletion tests/dataprep/test_dataprep_elasticsearch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set -x
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
DATAPREP_PORT=11100
export DATAPREP_PORT=11100
export TAG="comps"

function build_docker_images() {
Expand Down
2 changes: 1 addition & 1 deletion tests/dataprep/test_dataprep_milvus.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set -x
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
DATAPREP_PORT=11101
export DATAPREP_PORT=11101
service_name="dataprep-milvus tei-embedding-serving etcd minio standalone"
export TAG="comps"
export DATA_PATH=${model_cache}
Expand Down
2 changes: 1 addition & 1 deletion tests/dataprep/test_dataprep_milvus_multimodal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ image_fn="${tmp_dir}/${image_name}.png"
caption_fn="${tmp_dir}/${image_name}.txt"
pdf_name="nke-10k-2023"
pdf_fn="${tmp_dir}/${pdf_name}.pdf"
DATAPREP_PORT="11102"
export DATAPREP_PORT="11102"

function build_docker_images() {
cd $WORKPATH
Expand Down
6 changes: 3 additions & 3 deletions tests/dataprep/test_dataprep_neo4j_on_intel_hpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set -x
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
DATAPREP_PORT=11103
export DATAPREP_PORT=11103
LLM_ENDPOINT_PORT=10510
export TAG="comps"
export DATA_PATH=${model_cache}
Expand All @@ -29,8 +29,8 @@ function build_docker_images() {
function start_service() {
service_name="neo4j-apoc tei-embedding-serving tgi-gaudi-server dataprep-neo4j-llamaindex"
export host_ip=${ip_address}
export NEO4J_PORT1=7474 # 11631
export NEO4J_PORT2=7687 # 11632
export NEO4J_PORT1=11631
export NEO4J_PORT2=11632
export NEO4J_AUTH="neo4j/neo4jtest"
export NEO4J_URL="bolt://${ip_address}:${NEO4J_PORT2}"
export NEO4J_USERNAME="neo4j"
Expand Down
2 changes: 1 addition & 1 deletion tests/dataprep/test_dataprep_opensearch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set -x
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
DATAPREP_PORT="11104"
export DATAPREP_PORT="11104"
OPENSEARCH_INITIAL_ADMIN_PASSWORD="StRoNgOpEa0)"
export TAG="comps"

Expand Down
2 changes: 1 addition & 1 deletion tests/dataprep/test_dataprep_pgvector.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set -x
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
DATAPREP_PORT="11105"
export DATAPREP_PORT="11105"
export TAG="comps"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
Expand Down
2 changes: 1 addition & 1 deletion tests/dataprep/test_dataprep_pinecone.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ set -x

WORKPATH=$(dirname "$PWD")
ip_address=$(hostname -I | awk '{print $1}')
DATAPREP_PORT="11106"
export DATAPREP_PORT="11106"
export TAG="comps"

function build_docker_images() {
Expand Down
2 changes: 1 addition & 1 deletion tests/dataprep/test_dataprep_qdrant.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set -x
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
DATAPREP_PORT="11107"
export DATAPREP_PORT="11107"
TEI_EMBEDDER_PORT="10220"
export TAG="comps"
export DATA_PATH=${model_cache}
Expand Down
2 changes: 1 addition & 1 deletion tests/dataprep/test_dataprep_redis.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set -x
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
DATAPREP_PORT="11108"
export DATAPREP_PORT="11108"
TEI_EMBEDDER_PORT="10221"
export TAG="comps"
export DATA_PATH=${model_cache}
Expand Down
Loading