Skip to content
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
3090178
add struct2graph comps
siddhivelankar23 Mar 31, 2025
2f51130
name changes
siddhivelankar23 Mar 31, 2025
89ddae4
remove extras
siddhivelankar23 Mar 31, 2025
1e1d4eb
add loggers
siddhivelankar23 Mar 31, 2025
24ea228
loggers
siddhivelankar23 Mar 31, 2025
8774410
add LOAD_FORMAT variable
siddhivelankar23 Mar 31, 2025
55f9746
add README and tests
siddhivelankar23 Apr 1, 2025
b31842f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 1, 2025
188bbef
updates
siddhivelankar23 Apr 1, 2025
734201e
add STRUCT2GRAPH constant
siddhivelankar23 Apr 1, 2025
eac59b5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 1, 2025
723d0a1
add reqs
siddhivelankar23 Apr 1, 2025
359184d
update path
siddhivelankar23 Apr 1, 2025
2166d15
add readme
siddhivelankar23 Apr 1, 2025
7341329
add workflow file
siddhivelankar23 Apr 1, 2025
0c8d205
update README
siddhivelankar23 Apr 1, 2025
0b985d9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 1, 2025
0516243
Merge remote-tracking branch 'upstream/main' into struct2graph
siddhivelankar23 Apr 2, 2025
a9812d9
review updates
siddhivelankar23 Apr 3, 2025
0720e3f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 3, 2025
fb87e5a
Merge remote-tracking branch 'upstream/main' into struct2graph
siddhivelankar23 Apr 3, 2025
ac6d284
Merge remote-tracking branch 'upstream/main' into struct2graph
siddhivelankar23 Apr 7, 2025
1c9fd82
update health check and graph db initialization
siddhivelankar23 Apr 7, 2025
b2107f6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 7, 2025
a6cbab9
Merge branch 'main' into struct2graph
lvliang-intel Apr 9, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/workflows/docker/compose/struct2graph-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# this file should be run in the root of the repo
# CI compose file: builds the struct2graph microservice image from the
# component Dockerfile and tags it for the configured registry.
services:
  struct2graph:
    build:
      dockerfile: comps/struct2graph/src/Dockerfile
    image: ${REGISTRY:-opea}/struct2graph:${TAG:-latest}
1 change: 1 addition & 0 deletions comps/cores/mega/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class ServiceType(Enum):
TEXT2SQL = 19
TEXT2GRAPH = 20
TEXT2CYPHER = 21
STRUCT2GRAPH = 23


class MegaServiceEndpoint(Enum):
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

include:
  - ../../../third_parties/neo4j/deployment/docker_compose/compose.yaml

services:
  struct2graph:
    image: opea/struct2graph:latest
    container_name: struct2graph
    environment:
      - no_proxy=${no_proxy}
      - https_proxy=${https_proxy}
      - http_proxy=${http_proxy}
      - NEO4J_URL=${NEO4J_URL}
      # Previously this variable was set twice with conflicting values
      # ("import" and '/var/lib/neo4j/import'); keep a single absolute path.
      - NEO4J_server_directories_import=/var/lib/neo4j/import
      - NEO4J_PLUGINS=["apoc"]
      - NEO4J_dbms_security_allow__csv__import__from__file__urls=true
      # The value previously contained shell escaping (apoc.\\\*) and a stray
      # "neo4j:5.23.0" image tag pasted from a `docker run` command; compose
      # env entries are literal, so only the procedure pattern belongs here.
      - NEO4J_dbms_security_procedures_unrestricted=apoc.*
    ports:
      - ${STRUCT2GRAPH_PORT:-8090}:8090
    depends_on:
      neo4j-apoc:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:7474"]
      interval: 10s
      timeout: 5s
      retries: 10
      start_period: 30s
    ipc: host
    # NOTE(review): with network_mode: host the `ports` mapping and the bridge
    # network below are ignored on Linux — confirm host networking is required.
    network_mode: host
    restart: always

networks:
  default:
    driver: bridge
37 changes: 37 additions & 0 deletions comps/struct2graph/src/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Single-stage build. (An earlier `FROM ubuntu:22.04` stage with a WORKDIR was
# dead weight: nothing was copied from it, and a new FROM discards the stage.)
FROM python:3.11-slim
ENV LANG=C.UTF-8
# ARCH selects the torch wheel index: "cpu" (default) pulls CPU-only wheels.
ARG ARCH=cpu

# build-essential is needed to compile native wheels; vim is kept for debugging.
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    vim \
    build-essential

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

# --chown so the files belong to the runtime user (plain COPY after chown -R
# would leave them root-owned).
COPY --chown=user comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
    if [ ${ARCH} = "cpu" ]; then \
        pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/struct2graph/src/requirements.txt; \
    else \
        pip install --no-cache-dir -r /home/user/comps/struct2graph/src/requirements.txt; \
    fi

ENV https_proxy=${https_proxy}
ENV http_proxy=${http_proxy}
ENV no_proxy=${no_proxy}
ENV PYTHONPATH="/home/user/":$PYTHONPATH

# Drop root for the service process.
USER user

WORKDIR /home/user/comps/struct2graph/src/

ENTRYPOINT ["python", "opea_struct2graph_microservice.py"]
140 changes: 140 additions & 0 deletions comps/struct2graph/src/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
# Struct2Graph Microservice

The Struct2Graph Microservice represents a powerful solution for transforming structured data formats like csv and json into Neo4j graph structures, serving as a crucial bridge between traditional data sources and modern graph-based systems. This process allows for enriching existing graphs, performing advanced data analysis, and constructing comprehensive knowledge graphs.
By importing structured data, users can integrate it into RAG flows and enhance querying capabilities to uncover patterns and relationships across large datasets. It is particularly useful for populating databases, creating hierarchical structures, and enabling cross-document querying. Furthermore, this approach supports data integration and provides a solid foundation for developing sophisticated graph-based applications that can exploit the rich relationships and properties inherent in graph data structures.

## Features

To convert structured data from CSV and JSON we provide the following interface -
Input:

```
{
"input_text": "string",
"task": "string",
"cypher_cmd": "string"
}
```

Output: Directory with results to query.

The task can be set to the following -

1. Index - generates index based on the cypher command (Output: Generated index)
2. Query - queries the index based on the input text (Output: Directory with results to query)

## Implementation

The struct2graph microservice is able to load and query structured data through neo4j.
The service runs in a Docker container and can be started either with docker build + docker run or with docker compose.

## 🚀1. Start Microservice with docker run

### Install Requirements

```bash
pip install -r requirements.txt
```

### Export environment variables

```
cd comps/struct2graph/src/
source environment_setup.sh
```

OR

```
export https_proxy=${https_proxy}
export http_proxy=${http_proxy}
export no_proxy=${no_proxy}
export INDEX_NAME=${INDEX_NAME:-"graph_store"}
export PYTHONPATH="/home/user/"
export NEO4J_USERNAME=${NEO4J_USERNAME:-"neo4j"}
export NEO4J_PASSWORD=${NEO4J_PASSWORD:-"neo4j_password"}
export NEO4J_URL=${NEO4J_URL:-"neo4j://neo4j-apoc:7687"}
export DATA_DIRECTORY=${DATA_DIRECTORY:-data}
export STRUCT2GRAPH_PORT=8090
export LOAD_FORMAT="CSV" # or JSON
```

### Launch Neo4j Service

```bash
docker run --restart always \
--publish=7474:7474 \
--publish=7687:7687 \
--name neo4j-apoc \
-e NEO4J_apoc_export_file_enabled=true \
-e NEO4J_apoc_import_file_enabled=true \
-e NEO4J_apoc_import_file_use__neo4j__config=true \
-e NEO4J_server_directories_import=import \
-e NEO4J_dbms_security_allow__csv__import__from__file__urls=true \
-e NEO4J_PLUGINS=\[\"apoc\"\] \
-e NEO4J_dbms_security_procedures_unrestricted=apoc.\\\* neo4j:5.23.0
```

### Verify the Neo4j Service

```bash
curl -v http://localhost:7474
```

If the Neo4j server is running correctly, the response should include an HTTP status code of 200 OK. Any other status code or an error message indicates that the server is not running or is not accessible. If the port 7474 is mapped to another port, you should change the port in the command accordingly.

### Start struct2graph Microservice with Docker

Command to build struct2graph microservice -

```bash
docker build -f Dockerfile -t opea/struct2graph:latest ../../../
```

Command to run struct2graph microservice -

```bash
docker run -i -t --net=host --ipc=host -p ${STRUCT2GRAPH_PORT:-8090}:8090 opea/struct2graph:latest
```

The docker launches the struct2graph microservice interactively.

## 🚀2. Start Microservice with docker compose

Export environment variables as mentioned in option 1.

Command to run docker compose -

```bash
cd GenAIComps/tests/struct2graph/deployment/docker_compose

docker compose -f struct2graph-compose.yaml up
```

## 3. Validate the service using API endpoint

Example for "index" task -

```bash
curl -X POST http://localhost:$STRUCT2GRAPH_PORT/v1/struct2graph \
-H "accept: application/json" \
-H "Content-Type: application/json" \
    -d "{
          \"input_text\": \"\",
          \"task\": \"Index\",
          \"cypher_cmd\": \"LOAD CSV WITH HEADERS FROM 'file:///$DATA_DIRECTORY/test1.csv' AS row CREATE (:Person {ID: toInteger(row.ID), Name: row.Name, Age: toInteger(row.Age), City: row.City})\"
        }"
```

Example for "query" task -

```bash
curl -X POST http://localhost:$STRUCT2GRAPH_PORT/v1/struct2graph \
-H "accept: application/json" \
-H "Content-Type: application/json" \
    -d "{
          \"input_text\": \"MATCH (p:Person {Name:'Alice'}) RETURN p\",
          \"task\": \"Query\",
          \"cypher_cmd\": \"\"
        }"
```
34 changes: 34 additions & 0 deletions comps/struct2graph/src/environment_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

#######################################################################
# Proxy
#######################################################################
export https_proxy=${https_proxy}
export http_proxy=${http_proxy}
export no_proxy=${no_proxy}
################################################################
# Configure LLM Parameters based on the model selected.
################################################################
export INDEX_NAME=${INDEX_NAME:-"graph_store"}
export PYTHONPATH="/home/user/"
export NEO4J_USERNAME=${NEO4J_USERNAME:-"neo4j"}
export NEO4J_PASSWORD=${NEO4J_PASSWORD:-"neo4j_password"}
export NEO4J_URL=${NEO4J_URL:-"neo4j://neo4j-apoc:7687"}
export DATA_DIRECTORY=${DATA_DIRECTORY:-data}
export FILENAME=${FILENAME:-test1.csv}
# LOAD_FORMAT selects which Cypher template the service uses: CSV or JSON.
export LOAD_FORMAT=${LOAD_FORMAT:-"CSV"}


# NOTE(review): 'file:////test1.csv' has four slashes — presumably intended as
# file:///test1.csv (relative to Neo4j's import directory); confirm.
export CYPHER_CSV_CMD="LOAD CSV WITH HEADERS FROM 'file:////test1.csv' AS row \
CREATE (:Person {ID: toInteger(row.ID), Name: row.Name, Age: toInteger(row.Age), City: row.City});"
# Cypher string literals accept single quotes, so the file URL is quoted with
# '...' here; the previous unescaped double quotes terminated the shell string
# and broke this export.
export CYPHER_JSON_CMD=" \
CALL apoc.load.json('file:///test1.json') YIELD value \
UNWIND value.table AS row \
CREATE (:Person { \
    ID: row.ID, \
    Name: row.Name, \
    Age: row.Age, \
    City: row.City \
}); \
"
98 changes: 98 additions & 0 deletions comps/struct2graph/src/integrations/graph_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import logging
import os

from langchain_neo4j import Neo4jGraph

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class PrepareGraphDB:
    """Prepare, clean, and populate a Neo4j graph database.

    Connects to Neo4j using the NEO4J_URL / NEO4J_USERNAME / NEO4J_PASSWORD
    environment variables and exposes helpers to wipe the database and to load
    structured data (CSV or JSON) with a caller-supplied Cypher command.
    """

    def __init__(self):
        # Connect eagerly so a misconfigured environment fails fast.
        self.graph_store = self.neo4j_link()

    def neo4j_link(self):
        """Create and return a ``Neo4jGraph`` connection from env variables.

        Raises:
            EnvironmentError: if NEO4J_URL, NEO4J_USERNAME, or NEO4J_PASSWORD
                is unset.
        """
        neo4j_url = os.getenv("NEO4J_URL")
        neo4j_username = os.getenv("NEO4J_USERNAME")
        neo4j_password = os.getenv("NEO4J_PASSWORD")
        # NEO4J_DATABASE was previously read here but never used; dropped.

        if not all([neo4j_url, neo4j_username, neo4j_password]):
            raise EnvironmentError("Missing required Neo4j environment variables")

        return Neo4jGraph(username=neo4j_username, password=neo4j_password, url=neo4j_url)

    def cleanup_neo4j(self):
        """Delete all nodes, then drop every constraint and index.

        Re-raises any failure from the underlying queries after logging it.
        """
        try:
            self.graph_store.query("""MATCH (n) DETACH DELETE n""")

            logger.info("## Existing graph_store schema...")
            logger.info(self.graph_store.schema)

            logger.info("Deleting all nodes...")
            result = self.graph_store.query("""MATCH (n) RETURN count(n)""")
            # Log the count so the (previously discarded) result is visible.
            logger.info(f"Remaining node count after delete: {result}")

            logger.info("Dropping all constraints...")
            for constraint in self.graph_store.query("SHOW CONSTRAINTS"):
                self.graph_store.query(f"DROP CONSTRAINT {constraint['name']}")

            logger.info("Dropping all indexes...")
            for index in self.graph_store.query("SHOW INDEXES"):
                logger.info(f"Removing index {index['name']}:")
                self.graph_store.query(f"""DROP INDEX `{index['name']}`""")

            logger.info("## Blank schema...")
            self.graph_store.refresh_schema()
            logger.info(self.graph_store.schema)
            return

        except Exception as e:
            logger.error(f"Failed to cleanup Neo4j database: {str(e)}")
            raise

    def load_graphdb(self, cypher_cmd):
        """Execute *cypher_cmd* to load data into the graph.

        Args:
            cypher_cmd: Cypher statement performing the load (e.g. LOAD CSV
                or apoc.load.json), chosen by the caller to match LOAD_FORMAT.

        Returns:
            The populated graph store.

        Raises:
            ValueError: if LOAD_FORMAT is neither "CSV" nor "JSON".
        """
        load_format = os.getenv("LOAD_FORMAT", "CSV")

        # The CSV and JSON branches ran the same query and differed only in
        # log text, so they are unified here. The former `except NameError`
        # handler was dead code (a function parameter cannot raise NameError)
        # and referenced an unrelated variable; it has been removed.
        if load_format not in ("CSV", "JSON"):
            logger.error("Only CSV and JSON formats are supported")
            raise ValueError("Only CSV and JSON formats are supported")

        logger.info(f"INSERTING {load_format} Cypher command : {cypher_cmd}")
        logger.info("Preparing graphdb...")
        self.graph_store.query(cypher_cmd)
        logger.info(f"The following is the graph schema \n\n {self.graph_store.schema}")
        logger.info("GraphDB is created and saved.")
        return self.graph_store

    def prepare_insert_graphdb(self, cypher_cmd):
        """Wipe the database, then load data via *cypher_cmd*.

        Returns:
            The populated graph store.
        """
        logger.info("Cleaning up graph db")
        self.cleanup_neo4j()
        logger.info("Done cleaning up graph db")
        self.load_graphdb(cypher_cmd)
        logger.info("Completed inserting into graphdb")
        logger.info(f"The following is the graph schema \n\n {self.graph_store.schema}")
        return self.graph_store
Loading
Loading