opea-project · letonghan · Apr 11, 2025 · Mar 27, 2025 · Mar 27, 2025 · Mar 27, 2025
@@ -0,0 +1,9 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# this file should be run in the root of the repo
+services:
+  # Build definition for the text2kg microservice image.
+  text2kg:
+    # Registry and tag fall back to `opea` and `latest` when unset.
+    image: "${REGISTRY:-opea}/text2kg:${TAG:-latest}"
+    build:
+      # Path is relative to the build context (the repository root).
+      dockerfile: comps/text2kg/src/Dockerfile
@@ -36,6 +36,7 @@ class ServiceType(Enum):
     TEXT2SQL = 19
     TEXT2GRAPH = 20
     TEXT2CYPHER = 21
+    TEXT2KG = 22
 
 
 class MegaServiceEndpoint(Enum):

@@ -0,0 +1,29 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+include:
+  - ../../../third_parties/tgi/deployment/docker_compose/compose.yaml
+  - ../../../third_parties/neo4j/deployment/docker_compose/compose.yaml
+
+services:
+  # text2kg microservice; starts after the TGI and Neo4j services pulled in
+  # through the `include:` entries at the top of this file.
+  text2kg:
+    image: ${REGISTRY:-opea}/text2kg:${TAG:-latest}
+    container_name: text2kg
+    # Host networking exposes the service directly on the host (container
+    # port 8090), so no `ports:` mapping is declared: the Compose spec does not
+    # allow port mappings together with `network_mode: host`.
+    network_mode: host
+    ipc: host
+    environment:
+      - no_proxy=${no_proxy}
+      - https_proxy=${https_proxy}
+      - http_proxy=${http_proxy}
+      # Default is left unquoted: Compose copies the default text verbatim, so
+      # surrounding quotes would become part of the model id.
+      - LLM_MODEL_ID=${LLM_MODEL_ID:-HuggingFaceH4/zephyr-7b-alpha}
+      # Fall back to HF_TOKEN, the variable name exported by
+      # environment_setup.sh and used in the README.
+      - HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN:-${HF_TOKEN}}
+    depends_on:
+      - tgi-server
+      - neo4j-apoc
+    restart: always
+
+# Default bridge network for services that do not use host networking.
+networks:
+  default:
+    driver: bridge
@@ -0,0 +1,5 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+services:
+  # Override layered on top of compose.yaml for the CPU launch described in
+  # the README: run the tgi-server container with the `runc` runtime.
+  tgi-server:
+    runtime: "runc"
@@ -0,0 +1,53 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Single-stage image for the text2kg microservice.
+# Build from the repository root so that `COPY comps` resolves.
+FROM python:3.11-slim
+ENV LANG=C.UTF-8
+# ARCH=cpu pulls the CPU-only PyTorch wheels; any other value uses the
+# default package index.
+ARG ARCH=cpu
+
+# Install OS packages and drop the apt lists in the same layer to keep the
+# image small.
+RUN apt-get update -y && \
+    apt-get install -y vim wget && \
+    apt-get install -y --no-install-recommends --fix-missing build-essential && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+COPY comps /home/user/comps
+
+# Download the sample document into the service's data directory.
+# The directory is passed to wget explicitly because a `cd` in one RUN step
+# does not carry over to the next one.
+# NOTE(review): the directory name `tmpddata` is kept from the original;
+# confirm it matches the path the service reads from.
+RUN mkdir -p /home/user/comps/text2kg/src/tmpddata && \
+    wget -P /home/user/comps/text2kg/src/tmpddata \
+        https://gist.githubusercontent.com/wey-gu/75d49362d011a0f0354d39e396404ba2/raw/0844351171751ebb1ce54ea62232bf5e59445bb7/paul_graham_essay.txt
+
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    if [ "${ARCH}" = "cpu" ]; then \
+        pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/text2kg/src/requirements.txt; \
+    else \
+        pip install --no-cache-dir -r /home/user/comps/text2kg/src/requirements.txt; \
+    fi
+
+# Image-level defaults; override them at run time with `docker run -e` or the
+# compose `environment:` section.
+ENV https_proxy=${https_proxy}
+ENV http_proxy=${http_proxy}
+ENV no_proxy=${no_proxy}
+ENV LLM_ID=${LLM_ID:-"HuggingFaceH4/zephyr-7b-alpha"}
+ENV SPAN_LENGTH=${SPAN_LENGTH:-"1024"}
+ENV OVERLAP=${OVERLAP:-"100"}
+ENV MAX_LENGTH=${MAX_NEW_TOKENS:-"256"}
+ENV HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN}
+ENV HF_TOKEN=${HF_TOKEN}
+ENV LLM_MODEL_ID=${LLM_ID}
+ENV TGI_PORT=8008
+ENV PYTHONPATH="/home/user/":$PYTHONPATH
+
+USER user
+
+WORKDIR /home/user/comps/text2kg/src/
+
+# environment_setup.sh is deliberately not sourced in a RUN step: variables
+# exported inside a RUN shell are gone once that step finishes, so they never
+# reach the running container. Pass runtime settings as environment variables.
+
+ENTRYPOINT ["python", "opea_text2kg_microservice.py"]
@@ -0,0 +1,155 @@
+# Text to knowledge graph (text2kg) microservice
+
+Text to Knowledge Graph (text2kg) Microservice enables the conversion of unstructured text into structured data by generating graph triplets. This process, which can be complex, has become more accessible with the rise of Large Language Models (LLMs), making it a mainstream solution for data extraction tasks. We are using a decoder-only model for this application's purpose.
+This microservice can run on CPU or HPU (Gaudi); instructions for both are given below.
+
+## Decoder-Only Models
+
+Decoder-only models are optimized for fast inference by skipping the encoding step. They work well for tasks where input-output mappings are relatively simple, or when multitasking is required. These models are ideal when computational efficiency and prompt-based output generation are priorities. However, decoder-only models may struggle with tasks that require deep contextual understanding or when input-output structures are highly complex or varied.
+
+## Features
+
+Input Formats: Accepts text from documents, text files, or strings\*.
+
+Output: Answer to the query asked by the user.
+
+## 🚀 1. Start individual microservices using docker cli (Option 1)
+
+Update the `environment_setup.sh` file with your device and user information, then source it:
+
+```bash
+source comps/text2kg/src/environment_setup.sh
+```
+
+If you skip this step, you can export variables related to individual services as mentioned in each of the microservices.
+
+### 1. TGI
+
+#### a. Start the TGI microservice
+
+```bash
+# You can skip this part if you have already sourced environment_setup.sh.
+
+export your_ip=$(hostname -I | awk '{print $1}')
+export TGI_PORT=8008
+export HF_TOKEN=${HF_TOKEN}
+export LLM_MODEL_ID=${LLM_MODEL_ID:-"HuggingFaceH4/zephyr-7b-alpha"}
+export LLM_ENDPOINT_PORT=${LLM_ENDPOINT_PORT:-"9001"}
+export TGI_LLM_ENDPOINT="http://${your_ip}:${TGI_PORT}"
+export PYTHONPATH="/home/user/"
+```
+
+```bash
+docker run -d --name="text2graph-tgi-endpoint" --ipc=host -p $TGI_PORT:80 -v ./data:/data --shm-size 1g -e HF_TOKEN=${HF_TOKEN} -e model=${LLM_MODEL_ID} ghcr.io/huggingface/text-generation-inference:2.1.0 --model-id $LLM_MODEL_ID
+```
+
+#### b. Verify the TGI microservice
+
+```bash
+export your_ip=$(hostname -I | awk '{print $1}')
+curl http://${your_ip}:${TGI_PORT}/generate \
+  -X POST \
+  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
+  -H 'Content-Type: application/json'
+```
+
+### 2. Neo4J
+
+#### a. Download Neo4J image
+
+```bash
+docker pull neo4j:latest
+```
+
+#### b. Configure the username, password, dbname, and other Neo4j-related variables based on your data (this is an example)
+
+```bash
+# You can skip this part if you have already sourced environment_setup.sh.
+export NEO4J_AUTH=neo4j/password
+export NEO4J_PLUGINS=\[\"apoc\"\]
+export NEO4J_USERNAME=${NEO4J_USERNAME:-"neo4j"}
+export NEO4J_PASSWORD=${NEO4J_PASSWORD:-"neo4j_password"}
+export NEO4J_PORT1=${NEO4J_PORT1:-7474}:7474
+export NEO4J_PORT2=${NEO4J_PORT2:-7687}:7687
+export NEO4J_URL=${NEO4J_URL:-"neo4j://localhost:7687"}
+export NEO4J_URI=${NEO4J_URI:-"neo4j://localhost:7687"}
+
+export DATA_DIRECTORY=$(pwd)
+export ENTITIES="PERSON,PLACE,ORGANIZATION"
+export RELATIONS="HAS,PART_OF,WORKED_ON,WORKED_WITH,WORKED_AT"
+export VALIDATION_SCHEMA='{
+    "PERSON": ["HAS", "PART_OF", "WORKED_ON", "WORKED_WITH", "WORKED_AT"],
+    "PLACE": ["HAS", "PART_OF", "WORKED_AT"],
+    "ORGANIZATION": ["HAS", "PART_OF", "WORKED_WITH"]
+}'
+```
+
+#### c. Run Neo4J service
+
+Launch the database with the following docker command.
+
+```bash
+docker run \
+    -p 7474:7474 -p 7687:7687 \
+    -v $PWD/data:/data -v $PWD/plugins:/plugins \
+    --name neo4j-apoc \
+    -d \
+    -e NEO4J_AUTH=neo4j/password \
+    -e NEO4J_PLUGINS=\[\"apoc\"\]  \
+    neo4j:latest
+```
+
+### 3. Text2kg
+
+```bash
+cd comps/text2kg/src/
+export TEXT2KG_PORT=8090
+```
+
+Build the text2kg docker image
+
+```bash
+docker build -f Dockerfile -t opea/text2kg:latest ../../../
+```
+
+Launch the docker container
+
+```bash
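+# --net=host exposes the service port directly on the host, so no -p mapping is needed.
+# Options such as -v must come before the image name.
+docker run -i -t --net=host --ipc=host -v data:/home/user/comps/text2kg/src/data opea/text2kg:latest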
+```
+
+## 🚀 2. Start text2kg and dependent microservices with docker-compose (Option 2)
+
+```bash
+cd comps/text2kg/deployment/docker_compose/
+```
+
+Export service name and log path
+
+```bash
+export service_name="text2kg"
+export LOG_PATH=$PWD
+```
+
+Export NEO4J variables - refer to section 1.2.b.
+Launch using the following command to run on cpu
+
+```bash
+docker compose -f compose.yaml -f custom-override.yml up ${service_name}  -d > ${LOG_PATH}/start_services_with_compose.log
+```
+
+Launch using the following command to run on gaudi
+
+```bash
+docker compose -f compose.yaml up ${service_name}  -d > ${LOG_PATH}/start_services_with_compose.log
+```
+
+## 3. Check the service using API endpoint
+
+```bash
+curl -X 'POST' \
+  "http://localhost:${TEXT2KG_PORT}/v1/text2kg?input_text=Who%20is%20paul%20graham%3F" \
+  -H 'accept: application/json' \
+  -d ''
+```
+
+- Make sure your input document/string has the necessary information that can be extracted.
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# Environment for the text2kg microservice and its TGI / Neo4j dependencies.
+# Source this file (do not execute it) so the exports reach the current shell:
+#     source comps/text2kg/src/environment_setup.sh
+# Values written as ${VAR:-default} keep whatever is already set in the
+# environment.
+#######################################################################
+# Proxy
+#######################################################################
+export https_proxy=${https_proxy}
+export http_proxy=${http_proxy}
+export no_proxy=${no_proxy}
+# Fall back to the first address reported by `hostname -I` so that
+# TGI_LLM_ENDPOINT below is never built from an empty host.
+export your_ip=${your_ip:-$(hostname -I 2>/dev/null | awk '{print $1}')}
+################################################################
+# Configure LLM Parameters based on the model selected.
+################################################################
+
+export HF_TOKEN=${HF_TOKEN}
+
+export LLM_ID=${LLM_ID:-"HuggingFaceH4/zephyr-7b-alpha"}
+export LLM_MODEL_ID=${LLM_MODEL_ID:-"HuggingFaceH4/zephyr-7b-alpha"}
+export LLM_ENDPOINT_PORT=${LLM_ENDPOINT_PORT:-"9001"}
+
+export TGI_PORT=${TGI_PORT:-8008}
+export PYTHONPATH="/home/user/"
+export TGI_LLM_ENDPOINT="http://${your_ip}:${TGI_PORT}"
+
+################################################################
+# Neo4j
+################################################################
+export NEO4J_USERNAME=${NEO4J_USERNAME:-"neo4j"}
+export NEO4J_PASSWORD=${NEO4J_PASSWORD:-"neo4j_password"}
+export NEO4J_URL=${NEO4J_URL:-"neo4j://localhost:7687"}
+export NEO4J_URI=${NEO4J_URI:-"neo4j://localhost:7687"}
+# host:container port pairs; the host side is overridable.
+export NEO4J_PORT1=${NEO4J_PORT1:-7474}:7474
+export NEO4J_PORT2=${NEO4J_PORT2:-7687}:7687
+# Derived from the username/password above so the credentials the database
+# is created with match the ones exported for clients.
+export NEO4J_AUTH=${NEO4J_AUTH:-"${NEO4J_USERNAME}/${NEO4J_PASSWORD}"}
+export NEO4J_PLUGINS='["apoc"]'
+
+################################################################
+# Knowledge-graph extraction settings
+################################################################
+export DATA_DIRECTORY="$(pwd)"
+export ENTITIES="PERSON,PLACE,ORGANIZATION"
+export RELATIONS="HAS,PART_OF,WORKED_ON,WORKED_WITH,WORKED_AT"
+# JSON object mapping each entity type to the relations listed for it.
+export VALIDATION_SCHEMA='{
+    "PERSON": ["HAS", "PART_OF", "WORKED_ON", "WORKED_WITH", "WORKED_AT"],
+    "PLACE": ["HAS", "PART_OF", "WORKED_AT"],
+    "ORGANIZATION": ["HAS", "PART_OF", "WORKED_WITH"]
+}'
+export TEXT2KG_PORT=${TEXT2KG_PORT:-8090}