Skip to content

Commit 5388bed

Browse files
authored
v1.7 - IMPORTANT
Removed releases between 1.4.2 and 1.6 because I didn't like the changes, especially the newer syntax for ChromaDB. Specifically, the newer version of ChromaDB somehow has a limit of 6456 chunks, which prevents ingesting very large PDF documents. The only explanation I could think of is that the newer ChromaDB relies on SQLite while the older version relies on DuckDB + Parquet/Clickhouse. So rather than figure out how to make the newer ChromaDB work, I reverted, and it works fine. This release includes all prior improvements from the releases I deleted, except the newer ChromaDB version: the new GUI, new layout, refactoring of scripts in preparation for expansion, etc.
1 parent 1112ec8 commit 5388bed

12 files changed

+816
-0
lines changed

check_gpu.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import torch
2+
import tkinter as tk
3+
from tkinter import messagebox
4+
5+
def _build_info_message():
    """Assemble a human-readable summary of the GPU backends torch reports.

    Checks CUDA, Metal/MPS, and ROCm availability (with versions where torch
    exposes them) and returns the result as one newline-separated string.
    """
    info_message = ""

    if torch.cuda.is_available():
        info_message += "CUDA is available!\n"
        info_message += "CUDA version: {}\n\n".format(torch.version.cuda)
    else:
        info_message += "CUDA is not available.\n\n"

    if torch.backends.mps.is_available():
        info_message += "Metal/MPS is available!\n\n"
    else:
        info_message += "Metal/MPS is not available.\n\n"

    info_message += "If you want to check the version of Metal and MPS on your macOS device, you can go to \"About This Mac\" -> \"System Report\" -> \"Graphics/Displays\" and look for information related to Metal and MPS.\n\n"

    if torch.version.hip is not None:
        info_message += "ROCm is available!\n"
        info_message += "ROCm version: {}\n".format(torch.version.hip)
    else:
        info_message += "ROCm is not available.\n"

    return info_message

# Create a function to display the information in a popup window
def display_info():
    """Show the GPU/backend availability summary in a Tkinter message box."""
    info_message = _build_info_message()

    # Create a small window to display the information
    root = tk.Tk()
    root.withdraw()  # Hide the main window
    messagebox.showinfo("System Information", info_message)
    root.destroy()  # Close the hidden main window when the popup is closed

# Guarded entry point: the original called display_info() unconditionally,
# which popped up a window as a side effect of merely importing this module.
if __name__ == "__main__":
    display_info()

config.yaml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
AVAILABLE_MODELS:
2+
- BAAI/bge-large-en-v1.5
3+
- BAAI/bge-base-en-v1.5
4+
- BAAI/bge-small-en-v1.5
5+
- BAAI/bge-large-en
6+
- BAAI/bge-base-en
7+
- BAAI/bge-small-en
8+
- thenlper/gte-large
9+
- thenlper/gte-base
10+
- thenlper/gte-small
11+
- intfloat/e5-large-v2
12+
- intfloat/multilingual-e5-large
13+
- intfloat/e5-base-v2
14+
- intfloat/multilingual-e5-base
15+
- intfloat/e5-small-v2
16+
- intfloat/multilingual-e5-small
17+
- hkunlp/instructor-xl
18+
- hkunlp/instructor-large
19+
- hkunlp/instructor-base
20+
- sentence-transformers/all-mpnet-base-v2
21+
- sentence-transformers/all-MiniLM-L12-v2
22+
- sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
23+
- sentence-transformers/all-MiniLM-L6-v2
24+
- sentence-transformers/multi-qa-MiniLM-L6-cos-v1
25+
- sentence-transformers/sentence-t5-xl
26+
- sentence-transformers/sentence-t5-large
27+
- sentence-transformers/sentence-t5-base
28+
- sentence-transformers/gtr-t5-xl
29+
- sentence-transformers/gtr-t5-large
30+
- sentence-transformers/gtr-t5-base
31+
COMPUTE_DEVICE: cuda
32+
DOCUMENT_MAP:
33+
.csv: UnstructuredCSVLoader
34+
.docx: Docx2txtLoader
35+
.eml: UnstructuredEmailLoader
36+
.enex: EverNoteLoader
37+
.json: JSONLoader
38+
.msg: UnstructuredEmailLoader
39+
.pdf: PDFMinerLoader
40+
.txt: TextLoader
41+
.xls: UnstructuredExcelLoader
42+
.xlsx: UnstructuredExcelLoader
43+
EMBEDDING_MODEL_NAME: C:/PATH/Scripts/LM Search Vector Database/LM Search Vector Database_v1_5
44+
- working/Embedding_Models/hkunlp--instructor-xl

document_chunker.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from langchain.text_splitter import RecursiveCharacterTextSplitter
2+
3+
def split_documents(documents, chunk_size=1200, chunk_overlap=400):
    """Split LangChain documents into overlapping character chunks.

    Args:
        documents: sequence of LangChain Document objects to split.
        chunk_size: maximum characters per chunk. Defaults to 1200, the
            value previously hard-coded here, so existing callers are
            unaffected.
        chunk_overlap: characters shared between consecutive chunks so
            context is not lost at chunk boundaries. Defaults to 400.

    Returns:
        The list of chunked Document objects produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)

    return texts

document_loader.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import os
2+
from concurrent.futures import ThreadPoolExecutor, as_completed
3+
from concurrent.futures import ProcessPoolExecutor
4+
5+
from langchain.docstore.document import Document
6+
from langchain.document_loaders import (
7+
PDFMinerLoader,
8+
Docx2txtLoader,
9+
TextLoader,
10+
JSONLoader,
11+
EverNoteLoader,
12+
UnstructuredEmailLoader,
13+
UnstructuredCSVLoader,
14+
UnstructuredExcelLoader
15+
)
16+
17+
# Absolute directory containing this script; anchors all relative paths.
ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
# Folder the user drops documents into for ingestion into the database.
SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/Docs_for_DB"
# Worker count for parallel ingestion; falls back to 8 when the CPU count is unknown.
INGEST_THREADS = os.cpu_count() or 8

# Maps a file extension to the LangChain loader class used to read that file type.
DOCUMENT_MAP = {
    ".pdf": PDFMinerLoader,
    ".docx": Docx2txtLoader,
    ".txt": TextLoader,
    ".json": JSONLoader,
    ".enex": EverNoteLoader,
    ".eml": UnstructuredEmailLoader,
    ".msg": UnstructuredEmailLoader,
    ".csv": UnstructuredCSVLoader,
    ".xls": UnstructuredExcelLoader,
    ".xlsx": UnstructuredExcelLoader,
}
33+
34+
def load_single_document(file_path: str) -> Document:
    """Load one file into a Document using the loader mapped to its extension.

    Args:
        file_path: path to the file to ingest.

    Returns:
        The first Document produced by the matching loader.

    Raises:
        ValueError: if the file's extension has no entry in DOCUMENT_MAP.
    """
    file_extension = os.path.splitext(file_path)[1]
    loader_class = DOCUMENT_MAP.get(file_extension)
    if loader_class is None:
        # Name the offending extension and file so ingestion failures are diagnosable
        # (the original message gave no hint about which file was rejected).
        raise ValueError(f"Document type is undefined for extension '{file_extension}' ({file_path})")
    loader = loader_class(file_path)
    # loader.load() returns a list; only its first element is kept here.
    return loader.load()[0]
42+
43+
def load_document_batch(filepaths):
    """Load every file in *filepaths* concurrently on a thread pool.

    Returns:
        A tuple of (loaded documents, the filepaths that produced them).
    """
    with ThreadPoolExecutor(len(filepaths)) as pool:
        pending = [pool.submit(load_single_document, path) for path in filepaths]
        loaded = [job.result() for job in pending]

    return (loaded, filepaths)
50+
51+
def load_documents(source_dir: str) -> list[Document]:
    """Load every supported document found directly in *source_dir* in parallel.

    Files whose extension is not in DOCUMENT_MAP are skipped. Work is split
    into batches across a process pool; each process then fans out over
    threads (see load_document_batch).

    Args:
        source_dir: directory to scan (non-recursive).

    Returns:
        A list of loaded Document objects; empty if no supported files exist.
    """
    all_files = os.listdir(source_dir)
    paths = [os.path.join(source_dir, file_path) for file_path in all_files if os.path.splitext(file_path)[1] in DOCUMENT_MAP.keys()]

    # Bug fix: with no matching files, chunksize below would be 0 and
    # range(0, 0, 0) raises "range() arg 3 must not be zero" — bail out early.
    if not paths:
        return []

    n_workers = min(INGEST_THREADS, max(len(paths), 1))
    # n_workers <= len(paths) here, so chunksize is always >= 1.
    chunksize = round(len(paths) / n_workers)
    docs = []

    with ProcessPoolExecutor(n_workers) as executor:
        futures = [executor.submit(load_document_batch, paths[i : (i + chunksize)]) for i in range(0, len(paths), chunksize)]
        for future in as_completed(futures):
            contents, _ = future.result()
            docs.extend(contents)

    return docs
67+
68+
# Allow running this module directly to exercise ingestion against the
# default source directory.
if __name__ == "__main__":
    load_documents(SOURCE_DIRECTORY)

gui.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
import tkinter as tk
2+
from tkinter import font as tkfont
3+
import threading
4+
import torch
5+
import yaml
6+
from gui_table import create_table, create_pro_tip
7+
from metrics_gpu import GPU_Monitor
8+
from metrics_system import SystemMonitor
9+
import platform
10+
11+
# Shared label styling for the live metrics readouts (font + dark background).
METRICS_STYLE = {
    'font': ("Segoe UI Semibold", 12),
    'bg': "#202123"
}
# Uniform styling for the left-pane action buttons.
BUTTON_STYLE = {
    'width': 29,
    'bg': "#323842",
    'fg': "light gray",
    'font': ("Segoe UI Historic", 10)
}
# Flat, borderless frames so panes blend into the dark theme.
FRAME_STYLE = {
    'highlightthickness': 0,
    'relief': 'flat',
    'borderwidth': 0
}
26+
27+
def determine_compute_device():
    """Pick the best torch backend, persist it to config.yaml, and return it.

    Preference order is CUDA, then Apple MPS, then CPU. The choice is written
    back to the COMPUTE_DEVICE key of config.yaml so the other scripts read
    the same device.
    """
    if torch.cuda.is_available():
        chosen_device = "cuda"
    elif torch.backends.mps.is_available():
        chosen_device = "mps"
    else:
        chosen_device = "cpu"

    # Round-trip the config file so only COMPUTE_DEVICE changes.
    with open("config.yaml", 'r') as config_file:
        settings = yaml.safe_load(config_file)

    settings['COMPUTE_DEVICE'] = chosen_device

    with open("config.yaml", 'w') as config_file:
        yaml.safe_dump(settings, config_file)

    return chosen_device
44+
45+
def is_nvidia_gpu():
    """Return True when CUDA is usable and device 0 reports an NVIDIA name."""
    if not torch.cuda.is_available():
        return False
    return "nvidia" in torch.cuda.get_device_name(0).lower()
50+
51+
52+
class DocQA_GUI:
    """Main Tkinter window for the document Q&A tool.

    Layout: a horizontal paned window. The left pane holds the model table,
    a pro tip, live GPU/CPU/RAM metric labels, and the database-management
    buttons. The right pane holds a read-only answer display above a
    question entry box. Button commands are not wired here — they are
    attached externally (see gui_logic.DocQA_Logic in the __main__ block).
    """

    def __init__(self, root):
        self.root = root

        main_pane = tk.PanedWindow(root, orient=tk.HORIZONTAL, bg="#202123")
        main_pane.pack(fill=tk.BOTH, expand=1)

        # LEFT FRAME
        left_frame = tk.Frame(main_pane, bg="#202123", **FRAME_STYLE)

        # 1. TABLE
        create_table(left_frame)

        # 2. PRO TIP
        create_pro_tip(left_frame)

        # 3. METRICS — these labels are handed to the monitor objects below,
        # which update their text in place while the app runs.
        self.gpu_info_label = tk.Label(
            left_frame, **METRICS_STYLE, fg='#00c600')
        self.gpu_info_label.pack(pady=0, padx=1)

        self.vram_info_label = tk.Label(
            left_frame, **METRICS_STYLE, fg='violet red')
        self.vram_info_label.pack(pady=0, padx=1)

        self.ram_usage_label = tk.Label(
            left_frame, **METRICS_STYLE, fg='sky blue')
        self.ram_usage_label.pack(pady=0, padx=1)

        self.ram_used_label = tk.Label(
            left_frame, **METRICS_STYLE, fg='medium purple')
        self.ram_used_label.pack(pady=0, padx=1)

        self.cpu_usage_label = tk.Label(
            left_frame, **METRICS_STYLE, fg='gold')
        self.cpu_usage_label.pack(pady=0, padx=1)

        compute_device = determine_compute_device()
        os_name = platform.system().lower()

        # Monitors only start on Windows with an NVIDIA GPU (and a non-MPS
        # device); elsewhere both stay None and the metric labels stay blank.
        if compute_device != "mps" and os_name == "windows" and is_nvidia_gpu():
            self.cuda_logic = GPU_Monitor(
                self.vram_info_label, self.gpu_info_label, self.root)
            self.system_monitor = SystemMonitor(
                self.cpu_usage_label, self.ram_used_label, self.ram_usage_label, self.root)
        else:
            self.cuda_logic = None
            self.system_monitor = None

        # 4. BUTTONS — created without commands; callbacks are bound later.
        self.download_embedding_model_button = tk.Button(
            left_frame, text="Download Embedding Model", **BUTTON_STYLE)
        self.download_embedding_model_button.pack(pady=2)

        self.select_embedding_model_button = tk.Button(
            left_frame, text="Select Embedding Model Directory", **BUTTON_STYLE)
        self.select_embedding_model_button.pack(pady=2)

        self.choose_documents_button = tk.Button(
            left_frame, text="Choose Documents for Database", **BUTTON_STYLE)
        self.choose_documents_button.pack(pady=2)

        self.create_chromadb_button = tk.Button(
            left_frame, text="Create Vector Database", **BUTTON_STYLE)
        self.create_chromadb_button.pack(pady=2)
        main_pane.add(left_frame)

        # RIGHT FRAME
        right_frame = tk.Frame(main_pane, bg="#202123", **FRAME_STYLE)
        main_pane.add(right_frame)

        # For displaying the answer (read-only text widget with scrollbar)
        middle_frame = tk.Frame(right_frame, bg="#1e263c", **FRAME_STYLE)
        middle_frame.pack(pady=5, fill=tk.BOTH, expand=1)

        self.read_only_text = tk.Text(
            middle_frame, wrap=tk.WORD, state=tk.DISABLED, bg="#1e263c", fg="light gray")
        self.read_only_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=1)
        self.read_only_text.configure(font=("Segoe UI Historic", 12))

        scroll2 = tk.Scrollbar(
            middle_frame, command=self.read_only_text.yview)
        scroll2.pack(side=tk.RIGHT, fill=tk.Y)
        self.read_only_text.config(yscrollcommand=scroll2.set)

        # For inputting the question
        bottom_frame = tk.Frame(right_frame)
        bottom_frame.pack(pady=5, fill=tk.BOTH, expand=1)

        self.text_input = tk.Text(bottom_frame, wrap=tk.WORD, height=5, bg="#2a2b2e", fg="light gray")
        self.text_input.pack(side=tk.LEFT, fill=tk.BOTH, expand=1)
        self.text_input.configure(font=("Segoe UI Historic", 13))

        scroll1 = tk.Scrollbar(bottom_frame, command=self.text_input.yview)
        scroll1.pack(side=tk.RIGHT, fill=tk.Y)
        self.text_input.config(yscrollcommand=scroll1.set)

        self.submit_query_button = tk.Button(
            right_frame, text="Submit Question", width=29, bg="#323842", fg="light gray", font=("Segoe UI Historic", 10))
        self.submit_query_button.pack(pady=5, side=tk.TOP)

    def center_window(self, root):
        """Center *root* on screen; the window is hidden during the move to avoid flicker."""
        root.withdraw()
        root.update_idletasks()  # ensure winfo_width/height report the laid-out sizes

        width = root.winfo_width()
        height = root.winfo_height()
        x = (root.winfo_screenwidth() // 2) - (width // 2)
        y = (root.winfo_screenheight() // 2) - (height // 2)

        root.geometry('{}x{}+{}+{}'.format(width, height, x, y))
        root.deiconify()

    def stop_and_exit(self):
        """Stop any running monitor objects, then quit and destroy the window."""
        if self.cuda_logic:
            self.cuda_logic.stop_and_exit_gpu_monitor()
        if self.system_monitor:
            self.system_monitor.stop_and_exit_system_monitor()
        self.root.quit()
        self.root.destroy()

    def start_up(self):
        """Center the window, route window-close to stop_and_exit, and enter the main loop."""
        self.center_window(self.root)
        self.root.protocol("WM_DELETE_WINDOW", self.stop_and_exit)
        self.root.mainloop()
177+
178+
# Entry point: build the GUI, attach the logic layer, and run the event loop.
if __name__ == "__main__":
    # Imported here rather than at module top — presumably to avoid a
    # circular import between gui and gui_logic; confirm before moving.
    from gui_logic import DocQA_Logic
    root = tk.Tk()
    root.title("LM Studio ChromaDB Plugin - www.chintellalaw.com")
    root.geometry("850x910")
    root.minsize(850, 910)

    app = DocQA_GUI(root)
    logic = DocQA_Logic(app)  # wires button commands onto the widgets built above
    app.start_up()

0 commit comments

Comments
 (0)