Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/workflows/docker/compose/text2kg-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# this file should be run in the root of the repo
services:
  text2kg:
    build:
      # Path is resolved against the build context (the repo root, per the
      # note above), not against this compose file's directory.
      dockerfile: comps/text2kg/src/Dockerfile
    # REGISTRY/TAG default to "opea"/"latest" when not set in the environment.
    image: ${REGISTRY:-opea}/text2kg:${TAG:-latest}
1 change: 1 addition & 0 deletions comps/cores/mega/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class ServiceType(Enum):
TEXT2SQL = 19
TEXT2GRAPH = 20
TEXT2CYPHER = 21
TEXT2KG = 22


class MegaServiceEndpoint(Enum):
Expand Down
Empty file.
29 changes: 29 additions & 0 deletions comps/text2kg/deployment/docker_compose/compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Pull in the TGI and Neo4j service definitions this microservice depends on.
include:
  - ../../../third_parties/tgi/deployment/docker_compose/compose.yaml
  - ../../../third_parties/neo4j/deployment/docker_compose/compose.yaml

services:
  text2kg:
    image: ${REGISTRY:-opea}/text2kg:${TAG:-latest}
    container_name: text2kg
    ports:
      # Quoted so the mapping is parsed as a string, never a YAML scalar.
      - "${TEXT2KG_PORT:-8090}:8090"
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      # Default must be unquoted: compose interpolation keeps quote characters
      # inside ${VAR:-"..."} as part of the value.
      - LLM_MODEL_ID=${LLM_MODEL_ID:-HuggingFaceH4/zephyr-7b-alpha}
      - HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
    depends_on:
      - tgi-server
      - neo4j-apoc
    ipc: host
    # NOTE(review): with host networking the `ports:` mapping above is a no-op
    # and this service does not join the bridge network defined below —
    # confirm whether host mode is actually required (e.g. to reach TGI/Neo4j
    # on localhost) before removing either setting.
    network_mode: "host"
    restart: always

networks:
  default:
    driver: bridge
5 changes: 5 additions & 0 deletions comps/text2kg/deployment/docker_compose/custom-override.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Override layered on top of compose.yaml (docker compose -f compose.yaml
# -f custom-override.yml) to pin the TGI server to the default runc runtime —
# presumably for CPU-only hosts without a device-specific runtime; confirm.
services:
  tgi-server:
    runtime: runc
53 changes: 53 additions & 0 deletions comps/text2kg/src/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Image for the text2kg microservice. Runs as non-root "user" and serves
# opea_text2kg_microservice.py from /home/user/comps/text2kg/src/.
# (The previous unused `FROM ubuntu:22.04` stage was removed — nothing was
# copied from it and it only bloated the build.)
FROM python:3.11-slim

ENV LANG=C.UTF-8
ARG ARCH=cpu
# Declare build args so the ENV defaults below can be overridden at build
# time; without ARG, ${HF_TOKEN}/${LLM_ID}/... always expand to empty.
ARG HF_TOKEN
ARG LLM_ID
ARG SPAN_LENGTH
ARG OVERLAP
ARG MAX_NEW_TOKENS

RUN apt-get update -y && \
    apt-get install -y --no-install-recommends --fix-missing \
        build-essential vim wget && \
    rm -rf /var/lib/apt/lists/*

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

# --chown so the non-root runtime user can read/write the app tree; a plain
# COPY after the chown above would leave it root-owned.
COPY --chown=user comps /home/user/comps

# Sample corpus used by the service. The original `mkdir ... && cd ...`
# targeted a typo'd path (/home/users/.../tmpddata) and the `cd` did not
# persist into the next RUN, so the file landed in `/`; download it into the
# intended directory instead (wget -P sets the target directory).
RUN mkdir -p /home/user/comps/text2kg/src/tmpdata && \
    wget -P /home/user/comps/text2kg/src/tmpdata \
        https://gist.githubusercontent.com/wey-gu/75d49362d011a0f0354d39e396404ba2/raw/0844351171751ebb1ce54ea62232bf5e59445bb7/paul_graham_essay.txt

# CPU builds pull torch wheels from the CPU index to avoid the CUDA download.
RUN pip install --no-cache-dir --upgrade pip setuptools && \
    if [ ${ARCH} = "cpu" ]; then \
        pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/text2kg/src/requirements.txt; \
    else \
        pip install --no-cache-dir -r /home/user/comps/text2kg/src/requirements.txt; \
    fi

ENV https_proxy=${https_proxy}
ENV http_proxy=${http_proxy}
ENV no_proxy=${no_proxy}
ENV LLM_ID=${LLM_ID:-"HuggingFaceH4/zephyr-7b-alpha"}
ENV SPAN_LENGTH=${SPAN_LENGTH:-"1024"}
ENV OVERLAP=${OVERLAP:-"100"}
# NOTE(review): MAX_LENGTH is intentionally fed from MAX_NEW_TOKENS — confirm
# the consuming code reads MAX_LENGTH and not MAX_NEW_TOKENS.
ENV MAX_LENGTH=${MAX_NEW_TOKENS:-"256"}
ENV HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN}
ENV HF_TOKEN=${HF_TOKEN}
ENV LLM_MODEL_ID=${LLM_ID}
ENV TGI_PORT=8008
ENV PYTHONPATH="/home/user/":$PYTHONPATH

USER user

WORKDIR /home/user/comps/text2kg/src/

# Exports from `source` do NOT persist past this RUN layer; this only
# validates that the script runs. Runtime env comes from the ENV lines above.
RUN bash -c 'source /home/user/comps/text2kg/src/environment_setup.sh'

ENTRYPOINT ["python", "opea_text2kg_microservice.py"]
155 changes: 155 additions & 0 deletions comps/text2kg/src/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
# Text to knowledge graph (text2kg) microservice

Text to Knowledge Graph (text2kg) Microservice enables the conversion of unstructured text into structured data by generating graph triplets. This process, which can be complex, has become more accessible with the rise of Large Language Models (LLMs), making it a mainstream solution for data extraction tasks. We are using a decoder-only model for this application's purpose.
This microservice can be run on CPU or HPU (Gaudi); instructions for both are provided below.

## Decoder-Only Models

Decoder-only models are optimized for fast inference by skipping the encoding step. They work well for tasks where input-output mappings are relatively simple, or when multitasking is required. These models are ideal when computational efficiency and prompt-based output generation are priorities. However, decoder-only models may struggle with tasks that require deep contextual understanding or when input-output structures are highly complex or varied.

## Features

Input Formats: Accepts text from documents, text files, or strings\*.

Output: Answer to the query asked by the user.

## 🚀 1. Start individual microservices using docker cli (Option 1)

Update the environment_setup.sh file with your device and user information, and source it using -

```bash
source comps/text2kg/src/environment_setup.sh
```

If you skip this step, you can export variables related to individual services as mentioned in each of the microservices.

### 1. TGI

#### a. Start the TGI microservice

```bash
(you can skip this part if you have sourced your environment_setup file already)

export TGI_PORT=8008
export HF_TOKEN=${HF_TOKEN}
export LLM_MODEL_ID=${LLM_MODEL_ID:-"HuggingFaceH4/zephyr-7b-alpha"}
export LLM_ENDPOINT_PORT=${LLM_ENDPOINT_PORT:-"9001"}
export TGI_LLM_ENDPOINT="http://${your_ip}:${TGI_PORT}"
export PYTHONPATH="/home/user/"
```

```bash
docker run -d --name="text2graph-tgi-endpoint" --ipc=host -p $TGI_PORT:80 -v ./data:/data --shm-size 1g -e HF_TOKEN=${HF_TOKEN} -e model=${LLM_MODEL_ID} ghcr.io/huggingface/text-generation-inference:2.1.0 --model-id $LLM_MODEL_ID
```

#### b. Verify the TGI microservice

```bash
export your_ip=$(hostname -I | awk '{print $1}')
curl http://${your_ip}:${TGI_PORT}/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
```

### 2. Neo4J

#### a. Download Neo4J image

```bash
docker pull neo4j:latest
```

#### b. Configure the username, password, dbname, and other neo4j relational variables based on your data (this is an example)

```bash
(you can skip this part if you have sourced your environment_setup file already)
export NEO4J_AUTH=neo4j/password
export NEO4J_PLUGINS=\[\"apoc\"\]
export NEO4J_USERNAME=${NEO4J_USERNAME:-"neo4j"}
export NEO4J_PASSWORD=${NEO4J_PASSWORD:-"neo4j_password"}
export NEO4J_PORT1=${NEO4J_PORT1:-7474}:7474
export NEO4J_PORT2=${NEO4J_PORT2:-7687}:7687
export NEO4J_URL=${NEO4J_URL:-"neo4j://localhost:7687"}
export NEO4J_URI=${NEO4J_URI:-"neo4j://localhost:7687"}

export DATA_DIRECTORY=$(pwd)
export ENTITIES="PERSON,PLACE,ORGANIZATION"
export RELATIONS="HAS,PART_OF,WORKED_ON,WORKED_WITH,WORKED_AT"
export VALIDATION_SCHEMA='{
"PERSON": ["HAS", "PART_OF", "WORKED_ON", "WORKED_WITH", "WORKED_AT"],
"PLACE": ["HAS", "PART_OF", "WORKED_AT"],
"ORGANIZATION": ["HAS", "PART_OF", "WORKED_WITH"]
}'
```

#### c. Run Neo4J service

Launch the database with the following docker command.

```bash
docker run \
-p 7474:7474 -p 7687:7687 \
-v $PWD/data:/data -v $PWD/plugins:/plugins \
--name neo4j-apoc \
-d \
-e NEO4J_AUTH=neo4j/password \
-e NEO4J_PLUGINS=\[\"apoc\"\] \
neo4j:latest
```

### 3. Text2kg

```bash
cd comps/text2kg/src/
export TEXT2KG_PORT=8090
```

Build the text2kg docker image

```bash
docker build -f Dockerfile -t opea/text2kg:latest ../../../
```

Launch the docker container

```bash
docker run -i -t --net=host --ipc=host -p ${TEXT2KG_PORT}:8090 -v $PWD/data:/home/user/comps/text2kg/src/data opea/text2kg:latest
```

## 🚀 2. Start text2kg and dependent microservices with docker-compose (Option 2)

```bash
cd comps/text2kg/deployment/docker_compose/
```

Export service name and log path

```bash
export service_name="text2kg"
export LOG_PATH=$PWD
```

Export NEO4J variables - refer to section 1.2.b.
Launch using the following command to run on cpu

```bash
docker compose -f compose.yaml -f custom-override.yml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log
```

Launch using the following command to run on gaudi

```bash
docker compose -f compose.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log
```

## 3. Check the service using API endpoint

```bash
curl -X 'POST' \
  "http://localhost:${TEXT2KG_PORT}/v1/text2kg?input_text=Who%20is%20paul%20graham%3F" \
-H 'accept: application/json' \
-d ''
```

- Make sure your input document/string has the necessary information that can be extracted.
42 changes: 42 additions & 0 deletions comps/text2kg/src/environment_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#
# Environment for the text2kg microservice and its TGI/Neo4j dependencies.
# Source this from the repo root: `source comps/text2kg/src/environment_setup.sh`
# (The shebang must be the first line of the file to take effect when the
# script is executed directly.)
#######################################################################
# Proxy
#######################################################################
export https_proxy=${https_proxy}
export http_proxy=${http_proxy}
export no_proxy=${no_proxy}
export your_ip=${your_ip}
################################################################
# Configure LLM Parameters based on the model selected.
################################################################

export HF_TOKEN=${HF_TOKEN}

export LLM_ID=${LLM_ID:-"HuggingFaceH4/zephyr-7b-alpha"}
export LLM_MODEL_ID=${LLM_MODEL_ID:-"HuggingFaceH4/zephyr-7b-alpha"}
export LLM_ENDPOINT_PORT=${LLM_ENDPOINT_PORT:-"9001"}

export TGI_PORT=8008
export PYTHONPATH="/home/user/"
export TGI_LLM_ENDPOINT="http://${your_ip}:${TGI_PORT}"

export NEO4J_USERNAME=${NEO4J_USERNAME:-"neo4j"}
export NEO4J_PASSWORD=${NEO4J_PASSWORD:-"neo4j_password"}
export NEO4J_URL=${NEO4J_URL:-"neo4j://localhost:7687"}
export NEO4J_URI=${NEO4J_URI:-"neo4j://localhost:7687"}
# Port mappings (host:container). Was `{$VAR:-7474}` — literal braces, not a
# parameter expansion; `${VAR:-7474}` is the correct syntax.
# NOTE(review): re-sourcing this file appends ":7474"/":7687" again because
# the mapping is stored back into the same variable — source it only once.
export NEO4J_PORT1=${NEO4J_PORT1:-7474}:7474
export NEO4J_PORT2=${NEO4J_PORT2:-7687}:7687
export NEO4J_AUTH=neo4j/password
export NEO4J_PLUGINS=\[\"apoc\"\]

export DATA_DIRECTORY=$(pwd)
export ENTITIES="PERSON,PLACE,ORGANIZATION"
export RELATIONS="HAS,PART_OF,WORKED_ON,WORKED_WITH,WORKED_AT"
export VALIDATION_SCHEMA='{
    "PERSON": ["HAS", "PART_OF", "WORKED_ON", "WORKED_WITH", "WORKED_AT"],
    "PLACE": ["HAS", "PART_OF", "WORKED_AT"],
    "ORGANIZATION": ["HAS", "PART_OF", "WORKED_WITH"]
}'
export TEXT2KG_PORT=8090
Loading
Loading