Skip to content

Commit 6504b4f

Browse files
Merge main into person/hduygu/confluence-reader-example
2 parents e7c48f1 + 09c49ba commit 6504b4f

File tree

7 files changed

+210
-0
lines changed

7 files changed

+210
-0
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Copyright 2021-2024 VMware, Inc.
2+
# SPDX-License-Identifier: Apache-2.0
3+
import csv
4+
import logging
5+
import pathlib
6+
7+
from config import DOCUMENTS_CSV_FILE_LOCATION
8+
from langchain_community.document_loaders import ConfluenceLoader
9+
from vdk.api.job_input import IJobInput
10+
11+
log = logging.getLogger(__name__)
12+
13+
14+
def fetch_confluence_space(url, token, space_key):
    """
    Fetch documents from a Confluence space using LangChain's ConfluenceLoader.

    :param url: Base URL of the Confluence instance.
    :param token: API token string; may be empty for a public space.
    :param space_key: Key of the Confluence space to load.
    :return: The list of loaded documents, or None if fetching failed.
    """
    try:
        # For more info regarding the LangChain ConfluenceLoader:
        # https://python.langchain.com/docs/integrations/document_loaders/confluence
        loader = ConfluenceLoader(url=url, token=token)
        documents = loader.load(
            space_key=space_key, include_attachments=True, limit=50, max_pages=50
        )
        return documents
    except Exception as e:
        # log.exception preserves the full traceback, which log.error would drop
        log.exception(f"Error fetching documents from Confluence: {e}")
        return None
26+
27+
28+
def write_documents_to_csv(documents, filename):
    """Write each document's page content as one single-column CSV row."""
    with open(filename, mode="w", newline="", encoding="utf-8") as out:
        csv.writer(out).writerows([doc.page_content] for doc in documents)
33+
34+
35+
def run(job_input: IJobInput):
    """Data job step: fetch a public Confluence space and dump it to a CSV file."""
    log.info(f"Starting job step {__name__}")

    confluence_url = job_input.get_property(
        "confluence_url", "https://yoansalambashev.atlassian.net/"
    )
    # since the Confluence space is public, no need to generate API token
    token = ""
    space_key = job_input.get_property("space_key", "RESEARCH")
    output_csv = pathlib.Path(job_input.get_job_directory()) / DOCUMENTS_CSV_FILE_LOCATION

    docs = fetch_confluence_space(confluence_url, token, space_key)
    # guard clause: bail out early when nothing could be fetched
    if not docs:
        log.error(f"Failed to fetch any documents from the space with key {space_key}.")
        return

    log.info(f"{len(docs)} documents fetched successfully.")
    write_documents_to_csv(docs, output_csv)
    log.info(f"Documents written to {output_csv}")
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# Copyright 2021-2024 VMware, Inc.
2+
# SPDX-License-Identifier: Apache-2.0
3+
import csv
4+
import logging
5+
import pathlib
6+
import re
7+
8+
import nltk
9+
from config import DOCUMENTS_CSV_FILE_LOCATION
10+
from config import EMBEDDINGS_PKL_FILE_LOCATION
11+
from nltk.corpus import stopwords
12+
from nltk.stem import WordNetLemmatizer
13+
from sentence_transformers import SentenceTransformer
14+
from vdk.api.job_input import IJobInput
15+
16+
log = logging.getLogger(__name__)
17+
18+
19+
def clean_text(text):
    """
    Normalize raw text for NLP tasks (embedding and RAG).

    The pipeline lowercases the text (uniformity), strips punctuation and
    special characters, drops English stopwords (keeping only meaningful
    words), and lemmatizes each remaining token to its base form so that
    variants of a word share one representation.

    :param text: A string containing the text to be processed.
    :return: The processed text as a string.
    """
    lowered = text.lower()
    # remove punctuation and special characters
    no_punct = re.sub(r"[^\w\s]", "", lowered)
    # remove stopwords and lemmatize
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    kept = (
        lemmatizer.lemmatize(token)
        for token in no_punct.split()
        if token not in stop_words
    )
    return " ".join(kept)
40+
41+
42+
def load_and_clean_documents(filename):
    """
    Read raw documents from a single-column CSV and clean each one.

    :param filename: Path to the CSV produced by the fetch step
        (one document per row, no header row).
    :return: A list of single-element lists, each holding one cleaned
        document string (a row shape the CSV writer and embedder expect).
    """
    cleaned_documents = []
    with open(filename, encoding="utf-8") as file:
        reader = csv.reader(file)
        # The fetch step writes no header row, so every row is a document.
        # The previous `next(reader, None)` header-skip silently dropped
        # the first fetched document.
        for row in reader:
            if row:
                cleaned_text = clean_text(row[0])
                cleaned_documents.append([cleaned_text])
    return cleaned_documents
52+
53+
54+
def save_cleaned_documents(cleaned_documents, output_file):
    """Persist the cleaned document rows to a CSV file."""
    with open(output_file, mode="w", newline="", encoding="utf-8") as sink:
        csv.writer(sink).writerows(cleaned_documents)
58+
59+
60+
def embed_documents_in_batches(documents):
    """
    Embed the given documents with the all-mpnet-base-v2 SentenceTransformer.

    :param documents: Iterable of single-element document lists.
    :return: A flat list with one embedding per document.
    """
    # the model card: https://huggingface.co/sentence-transformers/all-mpnet-base-v2
    model = SentenceTransformer("all-mpnet-base-v2")
    total = len(documents)
    log.info(f"total: {total}")
    embeddings = []
    # the resources are not enough to batch 2 documents at a time, so the batch = 1 doc
    for batch in documents:
        log.info(f"BATCH: {len(batch)}.")
        embeddings.extend(model.encode(batch, show_progress_bar=True))
    return embeddings
72+
73+
74+
def run(job_input: IJobInput):
    """Data job step: clean the fetched documents and pickle their embeddings."""
    log.info(f"Starting job step {__name__}")

    # NOTE(review): unlike the fetch step, this path is not joined with the
    # job directory — presumably the job executes with that directory as the
    # working directory; verify against the runner.
    input_csv = DOCUMENTS_CSV_FILE_LOCATION
    data_job_dir = pathlib.Path(job_input.get_job_directory())
    output_embeddings = data_job_dir / EMBEDDINGS_PKL_FILE_LOCATION

    # create a temporary (until the end of the job execution) dir with
    # write permissions to store the relevant nltk dependencies
    temp_dir = job_input.get_temporary_write_directory()
    nltk_data_path = temp_dir / "nltk_data"
    nltk_data_path.mkdir(exist_ok=True)
    nltk.data.path.append(str(nltk_data_path))

    for corpus in ("stopwords", "wordnet"):
        nltk.download(corpus, download_dir=str(nltk_data_path))

    cleaned_documents = load_and_clean_documents(input_csv)
    if not cleaned_documents:
        return

    log.info(f"{len(cleaned_documents)} documents loaded and cleaned for embedding.")
    embeddings = embed_documents_in_batches(cleaned_documents)
    with open(output_embeddings, "wb") as file:
        import pickle

        pickle.dump(embeddings, file)
    log.info(f"Embeddings saved to {output_embeddings}")
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Fetch And Embed Data Job Example
2+
3+
The following Versatile Data Kit example shows how to fetch the data from a public Confluence space and embed it.
4+
5+
# Fetch Confluence Data
6+
The data is fetched in [10_fetch_confluence_space.py](./10_fetch_confluence_space.py) using the
7+
[LangChain's ConfluenceLoader](https://python.langchain.com/docs/integrations/document_loaders/confluence).
8+
9+
# Create embeddings for the data
10+
The fetched data from the previous step is read, cleaned and embedded using the
[all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) HuggingFace SentenceTransformer embedding model.
12+
13+
# Run the example
14+
To run the data job locally:
15+
```bash
16+
vdk run fetch-embed-job-example
17+
```
18+
19+
To open the output embeddings pickle file, use:
20+
21+
```python
22+
import pandas as pd
23+
24+
obj = pd.read_pickle(r'embeddings.pkl')
25+
```
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[owner]
2+
team = my-team
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Copyright 2021-2024 VMware, Inc.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# File name of the raw documents CSV produced by the Confluence fetch step.
DOCUMENTS_CSV_FILE_LOCATION = "documents.csv"
# File name of the pickled embeddings produced by the embedding step.
EMBEDDINGS_PKL_FILE_LOCATION = "embeddings.pkl"

0 commit comments

Comments
 (0)