forked from opea-project/GenAIComps
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathredis.py
More file actions
657 lines (561 loc) · 25.5 KB
/
redis.py
File metadata and controls
657 lines (561 loc) · 25.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# for test
import asyncio
import json
import os
from pathlib import Path
from typing import List, Optional, Union
import redis
from fastapi import Body, File, Form, HTTPException, UploadFile
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Redis
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import HTMLHeaderTextSplitter
from redis import asyncio as aioredis
from redis.commands.search.field import TextField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType
from comps.dataprep.src.utils import (
create_upload_folder,
document_loader,
encode_filename,
format_search_results,
get_separators,
get_tables_result,
parse_html_new,
remove_folder_with_ignore,
save_content_to_local_disk,
)
# Module-level logger used by every helper and by the component class below.
logger = CustomLogger("redis_dataprep")
# Verbose-logging switch read once at import time.
# NOTE(review): os.getenv returns the raw string when LOGFLAG is set, so any non-empty
# value (including "false" or "0") is truthy here — confirm that this is intended.
logflag = os.getenv("LOGFLAG", False)
# Local folder that uploaded files and fetched link content are written to.
upload_folder = "./uploaded_files/"
# Embedding model
EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5")
# TEI Embedding endpoints
TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "")
# Huggingface API token for TEI embedding endpoint
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
# Vector Index Configuration
# INDEX_NAME is the default index for document chunks; KEY_INDEX_NAME is the index that maps
# each ingested file to the "#"-joined keys of its chunks.
INDEX_NAME = os.getenv("INDEX_NAME", "rag_redis")
KEY_INDEX_NAME = os.getenv("KEY_INDEX_NAME", "file-keys")
TIMEOUT_SECONDS = int(os.getenv("TIMEOUT_SECONDS", 600))
# Page size used when listing the key index with FT.SEARCH.
SEARCH_BATCH_SIZE = int(os.getenv("SEARCH_BATCH_SIZE", 10))
# Redis Connection Information
REDIS_HOST = os.getenv("REDIS_HOST", "localhost")
REDIS_PORT = int(os.getenv("REDIS_PORT", 6379))
def get_boolean_env_var(var_name, default_value=False):
    """Interpret an environment variable as a boolean flag.

    The variable's value is compared case-insensitively against a fixed set of
    spellings: "true", "1", "t", "y", "yes" map to True and "false", "0", "f",
    "n", "no" map to False.

    Args:
        var_name (str): Name of the environment variable to read.
        default_value (bool): Value returned when the variable is unset or holds
            a spelling that is not recognised.

    Returns:
        bool: The parsed flag, or ``default_value`` when it cannot be parsed.
    """
    # One lookup table instead of two sets and an if/elif chain.
    recognised = dict.fromkeys(("true", "1", "t", "y", "yes"), True)
    recognised.update(dict.fromkeys(("false", "0", "f", "n", "no"), False))
    return recognised.get(os.getenv(var_name, "").lower(), default_value)
def format_redis_conn_from_env():
    """Build the Redis connection URL from environment variables.

    A non-empty ``REDIS_URL`` is returned unchanged. Otherwise the URL is assembled
    from ``REDIS_SSL`` (selects the ``rediss://`` or ``redis://`` scheme),
    ``REDIS_USERNAME`` / ``REDIS_PASSWORD`` (credentials are only added when a
    password is set) and the module-level ``REDIS_HOST`` / ``REDIS_PORT``.

    Returns:
        str: The connection URL.
    """
    explicit_url = os.getenv("REDIS_URL", None)
    if explicit_url:
        return explicit_url

    scheme = "rediss://" if get_boolean_env_var("REDIS_SSL", False) else "redis://"
    # if using RBAC
    secret = os.getenv("REDIS_PASSWORD", None)
    user = os.getenv("REDIS_USERNAME", "default")
    credentials = "" if secret is None else f"{user}:{secret}@"
    return f"{scheme}{credentials}{REDIS_HOST}:{REDIS_PORT}"
# Resolve the connection URL once at import time and build a connection pool from it;
# the pool backs the synchronous redis client created by the component class.
REDIS_URL = format_redis_conn_from_env()
redis_pool = redis.ConnectionPool.from_url(REDIS_URL)
async def check_index_existance(client):
    """Probe whether the search index bound to ``client`` can be queried.

    Args:
        client: Async search handle exposing ``search``.

    Returns:
        The result of the match-all query when it succeeds, ``None`` when the
        query raises (treated as "index does not exist").
    """
    if logflag:
        logger.info(f"[ check index existence ] checking {client}")
    try:
        found = await client.search("*")
    except Exception as e:
        if logflag:
            logger.info(f"[ check index existence ] index does not exist: {e}")
        return None
    else:
        if logflag:
            logger.info(f"[ check index existence ] index of client exists: {client}")
        return found
async def create_index(client, index_name: str = KEY_INDEX_NAME):
    """Create the hash index that tracks ingested files.

    The index covers keys prefixed with ``file:`` and has two text fields,
    ``file_name`` and ``key_ids``.

    Args:
        client: Async search handle; the index is created on whatever index this
            handle is bound to.
        index_name (str): Only used in log messages.

    Returns:
        bool: True when the index was created, False when creation raised.
    """
    if logflag:
        logger.info(f"[ create index ] creating index {index_name}")
    created = True
    try:
        schema = (TextField("file_name"), TextField("key_ids"))
        hash_definition = IndexDefinition(index_type=IndexType.HASH, prefix=["file:"])
        await client.create_index(schema, definition=hash_definition)
        if logflag:
            logger.info(f"[ create index ] index {index_name} successfully created")
    except Exception as e:
        created = False
        if logflag:
            logger.info(f"[ create index ] fail to create index {index_name}: {e}")
    return created
async def store_by_id(client, key, value):
    """Record the chunk keys of one file in the key index.

    Args:
        client: Async search handle exposing ``add_document``.
        key: File identifier; the document id is ``"file:" + key``.
        value: String stored in the ``key_ids`` field.

    Returns:
        bool: True when the document was stored, False when storing raised.
    """
    if logflag:
        logger.info(f"[ store by id ] storing ids of {key}")
    stored = True
    try:
        await client.add_document(doc_id="file:" + key, file_name=key, key_ids=value)
        if logflag:
            logger.info(f"[ store by id ] store document success. id: file:{key}")
    except Exception as e:
        stored = False
        if logflag:
            logger.info(f"[ store by id ] fail to store document file:{key}: {e}")
    return stored
async def search_by_id(client, doc_id):
    """Load a single document by id.

    Args:
        client: Async search handle exposing ``load_document``.
        doc_id: Id of the document to load.

    Returns:
        Whatever ``load_document`` returns, or ``None`` when loading raised.
        This function itself never raises.
    """
    if logflag:
        logger.info(f"[ search by id ] searching docs of {doc_id}")
    loaded = None
    try:
        loaded = await client.load_document(doc_id)
        if logflag:
            logger.info(f"[ search by id ] search success of {doc_id}: {loaded}")
    except Exception as e:
        loaded = None
        if logflag:
            logger.info(f"[ search by id ] fail to search docs of {doc_id}: {e}")
    return loaded
def drop_index(index_name, redis_url=REDIS_URL):
    """Drop a search index together with its documents.

    The drop call is made unconditionally and its result is checked explicitly.
    It must not live inside an ``assert``: assert statements are removed when
    Python runs with ``-O``, which would skip the drop and still report success.

    Args:
        index_name: Name of the index to drop.
        redis_url: Connection URL handed to ``Redis.drop_index``.

    Returns:
        bool: True when the index was dropped, False when the call raised or
        returned a falsy result.
    """
    if logflag:
        logger.info(f"[ drop index ] dropping index {index_name}")
    try:
        dropped = Redis.drop_index(index_name=index_name, delete_documents=True, redis_url=redis_url)
    except Exception as e:
        if logflag:
            logger.info(f"[ drop index ] index {index_name} delete failed: {e}")
        return False
    if not dropped:
        if logflag:
            logger.info(f"[ drop index ] index {index_name} delete failed: drop_index reported failure")
        return False
    if logflag:
        logger.info(f"[ drop index ] index {index_name} deleted")
    return True
async def delete_by_id(client, id):
    """Delete one document from the index bound to ``client``.

    The outcome of ``delete_document`` is checked with a plain ``if`` rather than
    an ``assert``, so a failed delete is still reported as False when Python runs
    with ``-O`` (which strips assert statements).

    Args:
        client: Async search handle exposing ``delete_document``.
        id: Id of the document to delete (parameter name kept for callers).

    Returns:
        bool: True when the delete reported success, False when it raised or
        returned a falsy result.
    """
    try:
        deleted = await client.delete_document(id)
    except Exception as e:
        if logflag:
            logger.info(f"[ delete by id ] fail to delete ids {id}: {e}")
        return False
    if not deleted:
        if logflag:
            logger.info(f"[ delete by id ] fail to delete ids {id}: delete_document reported failure")
        return False
    if logflag:
        logger.info(f"[ delete by id ] delete id success: {id}")
    return True
async def ingest_chunks_to_redis(file_name: str, chunks: List, embedder, index_name: str):
    """Embed and store text chunks, then record their keys under the file's name.

    Chunks are written in batches of 32 through ``Redis.from_texts_return_keys``
    (run in a worker thread). The returned keys are joined with ``"#"`` and saved
    in the ``KEY_INDEX_NAME`` index under ``<encoded index name>_<file_name>``.

    Args:
        file_name (str): Name the chunk keys are recorded under.
        chunks (List): Text chunks to embed and store.
        embedder: Embedding object passed to the vector store.
        index_name (str): Target index; a falsy value selects ``INDEX_NAME``.

    Returns:
        bool: True when every batch and the key record were stored.

    Raises:
        HTTPException: 500 when the key record cannot be stored.
    """
    if logflag:
        logger.info(f"[ redis ingest chunks ] file name: {file_name}")

    # Batch size
    batch_size = 32
    num_chunks = len(chunks)

    # if data will be saved to a different index name than the default one
    ingest_index_name = index_name if index_name else INDEX_NAME

    file_ids = []
    for i in range(0, num_chunks, batch_size):
        if logflag:
            logger.info(f"[ redis ingest chunks ] Current batch: {i}")
        batch_texts = chunks[i : i + batch_size]

        _, keys = await asyncio.to_thread(
            Redis.from_texts_return_keys,
            texts=batch_texts,
            embedding=embedder,
            index_name=ingest_index_name,
            redis_url=REDIS_URL,
        )
        if logflag:
            logger.info(f"[ redis ingest chunks ] keys: {keys}")
        file_ids.extend(keys)
        if logflag:
            logger.info(f"[ redis ingest chunks ] Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}")

    # store file_ids into index file-keys
    # The async client is opened as a context manager so its connections are
    # released when this call finishes instead of leaking one client per file.
    async with aioredis.from_url(REDIS_URL) as r:
        client = r.ft(KEY_INDEX_NAME)
        if not await check_index_existance(client):
            await create_index(client)

        try:
            stored = await store_by_id(
                client, key=encode_filename(ingest_index_name) + "_" + file_name, value="#".join(file_ids)
            )
        except Exception as e:
            if logflag:
                logger.info(f"[ redis ingest chunks ] {e}. Fail to store chunks of file {file_name}.")
            raise HTTPException(status_code=500, detail=f"Fail to store chunks of file {file_name}.") from e

    # store_by_id signals failure through its return value, not by raising, so the
    # result has to be inspected for the error to reach the caller.
    if not stored:
        if logflag:
            logger.info(f"[ redis ingest chunks ] Fail to store chunks of file {file_name}.")
        raise HTTPException(status_code=500, detail=f"Fail to store chunks of file {file_name}.")
    return True
async def ingest_data_to_redis(doc_path: DocPath, embedder, index_name):
    """Load a document, split it into chunks and ingest the chunks into Redis.

    HTML files are split on h1/h2/h3 headers; other files use a recursive
    character splitter configured from ``doc_path``. Files with a structured
    extension are not split: the loader's output is used as the chunk list
    directly. For PDFs, table chunks are appended when ``doc_path.process_table``
    is set.

    Args:
        doc_path (DocPath): Path of the document plus chunking/table options.
        embedder: Embedding object passed through to the chunk ingestion.
        index_name: Target index name passed through to the chunk ingestion.

    Returns:
        The result of ``ingest_chunks_to_redis``.
    """
    path = doc_path.path
    if logflag:
        logger.info(f"[ redis ingest data ] Parsing document {path}.")

    if path.endswith(".html"):
        headers_to_split_on = [
            ("h1", "Header 1"),
            ("h2", "Header 2"),
            ("h3", "Header 3"),
        ]
        text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    else:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=doc_path.chunk_size,
            chunk_overlap=doc_path.chunk_overlap,
            add_start_index=True,
            separators=get_separators(),
        )

    content = await document_loader(path)
    if logflag:
        logger.info("[ redis ingest data ] file content loaded")

    # os.path.splitext returns the extension WITH its leading dot, so every entry
    # needs the dot; a bare "jsonl" would never match a ".jsonl" file.
    structured_types = [".xlsx", ".csv", ".json", ".jsonl"]
    _, ext = os.path.splitext(path)

    if ext in structured_types:
        chunks = content
    else:
        chunks = await asyncio.to_thread(text_splitter.split_text, content)

    ### Specially processing for the table content in PDFs
    if doc_path.process_table and path.endswith(".pdf"):
        table_chunks = get_tables_result(path, doc_path.table_strategy)
        chunks = chunks + table_chunks
    if logflag:
        logger.info(f"[ redis ingest data ] Done preprocessing. Created {len(chunks)} chunks of the given file.")

    file_name = doc_path.path.split("/")[-1]
    return await ingest_chunks_to_redis(file_name, chunks, embedder, index_name)
@OpeaComponentRegistry.register("OPEA_DATAPREP_REDIS")
class OpeaRedisDataprep(OpeaComponent):
    """A specialized dataprep component derived from OpeaComponent for redis dataprep services.

    Attributes:
        client (redis.Redis): Synchronous redis client built on the module-level connection pool.
        data_index_client: Async search handle bound to ``INDEX_NAME`` (``None`` if setup failed).
        key_index_client: Async search handle bound to ``KEY_INDEX_NAME`` (``None`` if setup failed).
        embedder: Embedding object returned by ``_initialize_embedder``.
    """

    def __init__(self, name: str, description: str, config: dict = None):
        """Create the redis clients and the embedder, then run one health check.

        A failed health check is logged as an error; it does not abort construction.
        """
        super().__init__(name, ServiceType.DATAPREP.name.lower(), description, config)
        self.client = redis.Redis(connection_pool=redis_pool)
        self.data_index_client, self.key_index_client = asyncio.run(self._initialize_client())
        self.embedder = asyncio.run(self._initialize_embedder())
        health_status = asyncio.run(self.check_health())
        if not health_status:
            logger.error("OpeaRedisDataprep health check failed.")

    async def _initialize_client(self):
        """Initializes the redis client.

        Returns:
            tuple: ``(data_index_client, key_index_client)``. When the client cannot be
            created the error is logged and ``(None, None)`` is returned, so the two-value
            unpacking in ``__init__`` does not fail with an unrelated ``TypeError``.
        """
        if logflag:
            logger.info("[ initialize client ] initializing redis client...")
        try:
            client = await aioredis.from_url(REDIS_URL)
            data_index_client = client.ft(INDEX_NAME)
            key_index_client = client.ft(KEY_INDEX_NAME)
            return data_index_client, key_index_client
        except Exception as e:
            logger.error(f"fail to initialize redis client: {e}")
            return None, None

    async def _initialize_embedder(self):
        """Select the embedding backend.

        With ``TEI_EMBEDDING_ENDPOINT`` set, the endpoint's ``/info`` route is queried for
        its ``model_id`` and a remote inference embedder is created; otherwise a local
        ``HuggingFaceEmbeddings`` for ``EMBED_MODEL`` is used.

        Raises:
            HTTPException: 400 when the TEI endpoint is configured without
                ``HUGGINGFACEHUB_API_TOKEN`` or when ``/info`` does not answer with 200.
        """
        if TEI_EMBEDDING_ENDPOINT:
            if not HUGGINGFACEHUB_API_TOKEN:
                raise HTTPException(
                    status_code=400,
                    detail="You MUST offer the `HUGGINGFACEHUB_API_TOKEN` when using `TEI_EMBEDDING_ENDPOINT`.",
                )
            import httpx

            async with httpx.AsyncClient() as client:
                response = await client.get(TEI_EMBEDDING_ENDPOINT + "/info")
                if response.status_code != 200:
                    raise HTTPException(
                        status_code=400, detail=f"TEI embedding endpoint {TEI_EMBEDDING_ENDPOINT} is not available."
                    )
                model_id = response.json()["model_id"]
            # create embeddings using TEI endpoint service
            embedder = HuggingFaceInferenceAPIEmbeddings(
                api_key=HUGGINGFACEHUB_API_TOKEN, model_name=model_id, api_url=TEI_EMBEDDING_ENDPOINT
            )
        else:
            # create embeddings using local embedding model
            embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
        return embedder

    async def check_health(self) -> bool:
        """Checks the health of the dataprep service.

        Returns:
            bool: True if the service is reachable and healthy, False otherwise.
        """
        if logflag:
            logger.info("[ health check ] start to check health of redis")
        try:
            alive = self.client.ping()
        except redis.ConnectionError as e:
            logger.info(f"[ health check ] Failed to connect to Redis: {e}")
            return False
        if alive and logflag:
            logger.info("[ health check ] Successfully connected to Redis!")
        # Always hand back a real bool; a falsy ping must read as "unhealthy", not None.
        return bool(alive)

    def invoke(self, *args, **kwargs):
        """No-op placeholder; this component is driven through its ingest/get/delete methods."""
        pass

    async def _lookup_key_ids(self, doc_id: str):
        """Return the ``key_ids`` recorded for ``doc_id`` in the key index, or ``None`` if absent."""
        # search_by_id returns None on any error, and a loaded record may lack the
        # attribute, so both cases collapse to None here.
        record = await search_by_id(self.key_index_client, doc_id)
        return getattr(record, "key_ids", None)

    async def _ingest_saved_file(
        self, save_path, chunk_size, chunk_overlap, process_table, table_strategy, index_name
    ):
        """Chunk, embed and store the document that was already saved at ``save_path``."""
        await ingest_data_to_redis(
            DocPath(
                path=save_path,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                process_table=process_table,
                table_strategy=table_strategy,
            ),
            self.embedder,
            index_name,
        )

    async def ingest_files(
        self,
        files: Optional[Union[UploadFile, List[UploadFile]]] = File(None),
        link_list: Optional[str] = Form(None),
        chunk_size: int = Form(1500),
        chunk_overlap: int = Form(100),
        process_table: bool = Form(False),
        table_strategy: str = Form("fast"),
        ingest_from_graphDB: bool = Form(False),
        index_name: str = Form(None),
    ):
        """Ingest files/links content into redis database.

        Save in the format of vector[768].
        Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful.
        When ``files`` is given it is processed and the method returns; ``link_list`` is
        only processed when no files were supplied.

        Args:
            files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None).
            link_list (str, optional): A JSON-encoded list of links to be ingested. Defaults to Form(None).
            chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500).
            chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100).
            process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False).
            table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast").
            ingest_from_graphDB (bool, optional): Accepted for interface compatibility; not read by this method.
            index_name (str, optional): The name of the index where data will be ingested.

        Raises:
            HTTPException: 400 when a file/link is already recorded for the index, when
                ``link_list`` is not a valid JSON list, or when neither input is provided.
        """
        if logflag:
            logger.info(f"[ redis ingest ] files:{files}")
            logger.info(f"[ redis ingest ] link_list:{link_list}")

        # Same defaulting rule as ingest_chunks_to_redis (falsy -> INDEX_NAME), so the
        # duplicate check looks at exactly the key that ingestion will write.
        effective_index = index_name if index_name else INDEX_NAME

        if files:
            if not isinstance(files, list):
                files = [files]
            index_name_id = encode_filename(effective_index)

            for file in files:
                encode_file = encode_filename(file.filename)
                doc_id = "file:" + index_name_id + "_" + encode_file
                if logflag:
                    logger.info(f"[ redis ingest ] processing file {doc_id}")

                # check whether the file already exists
                if await self._lookup_key_ids(doc_id):
                    if logflag:
                        logger.info(f"[ redis ingest] File {file.filename} already exists.")
                    raise HTTPException(
                        status_code=400,
                        detail=f"Uploaded file {file.filename} already exists. Please change file name or index name.",
                    )
                logger.info(f"[ redis ingest] File {file.filename} does not exist.")

                save_path = upload_folder + encode_file
                await save_content_to_local_disk(save_path, file)
                await self._ingest_saved_file(
                    save_path, chunk_size, chunk_overlap, process_table, table_strategy, index_name
                )
                if logflag:
                    logger.info(f"[ redis ingest] Successfully saved file {save_path}")

            result = {"status": 200, "message": "Data preparation succeeded"}
            if logflag:
                logger.info(result)
            return result

        if link_list:
            try:
                link_list = json.loads(link_list)  # Parse JSON string to list
            except (TypeError, ValueError) as e:
                # Malformed input is a client error, not a server failure.
                raise HTTPException(status_code=400, detail=f"Link_list {link_list} is not valid JSON.") from e
            if not isinstance(link_list, list):
                raise HTTPException(status_code=400, detail=f"Link_list {link_list} should be a list.")
            index_name_id = encode_filename(effective_index)

            for link in link_list:
                encoded_link = encode_filename(link)
                doc_id = "file:" + index_name_id + "_" + encoded_link + ".txt"
                if logflag:
                    logger.info(f"[ redis ingest] processing link {doc_id}")

                # check whether the link file already exists
                if await self._lookup_key_ids(doc_id):
                    if logflag:
                        logger.info(f"[ redis ingest] Link {link} already exists.")
                    raise HTTPException(
                        status_code=400,
                        detail=f"Uploaded link {link} already exists. Please change another link or index_name.",
                    )
                logger.info(f"[ redis ingest] Link {link} does not exist. Keep storing.")

                save_path = upload_folder + encoded_link + ".txt"
                content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
                await save_content_to_local_disk(save_path, content)
                await self._ingest_saved_file(
                    save_path, chunk_size, chunk_overlap, process_table, table_strategy, index_name
                )

            if logflag:
                logger.info(f"[ redis ingest] Successfully saved link list {link_list}")
            return {"status": 200, "message": "Data preparation succeeded"}

        raise HTTPException(status_code=400, detail="Must provide either a file or a string list.")

    async def get_files(self):
        """Get file structure from redis database in the format of
        {
            "name": "File Name",
            "id": "File Name",
            "type": "File",
            "parent": "",
        }

        Returns:
            list: Accumulated output of ``format_search_results`` over every page of the
            key index; an empty list when the key index does not exist.
        """
        if logflag:
            logger.info("[ redis get ] start to get file structure")

        offset = 0
        file_list = []

        # check index existence
        res = await check_index_existance(self.key_index_client)
        if not res:
            if logflag:
                logger.info(f"[ redis get ] index {KEY_INDEX_NAME} does not exist")
            return file_list

        while True:
            # FT.SEARCH LIMIT takes (offset, count). The count stays at the page size;
            # using offset + page size here would make later pages overlap.
            response = self.client.execute_command(
                "FT.SEARCH", KEY_INDEX_NAME, "*", "LIMIT", offset, SEARCH_BATCH_SIZE
            )
            # no doc retrieved
            if len(response) < 2:
                break
            file_list = format_search_results(response, file_list)
            offset += SEARCH_BATCH_SIZE
            # last batch
            if (len(response) - 1) // 2 < SEARCH_BATCH_SIZE:
                break
        if logflag:
            logger.info(f"[get] final file_list: {file_list}")
        return file_list

    async def delete_files(self, file_path: str = Body(..., embed=True), index_name: str = Body(None, embed=True)):
        """Delete file according to `file_path`.

        `file_path`:
            - specific file path (e.g. /path/to/file.txt)
            - "all": delete all files uploaded

        Args:
            file_path (str): File to delete, or "all".
            index_name (str, optional): Index the file was ingested into; a falsy value
                selects ``INDEX_NAME``. Ignored when ``file_path`` is "all".

        Returns:
            dict: ``{"status": True}`` on success.

        Raises:
            HTTPException: 404 when the file is not recorded in the key index or the path
                is a folder; 500 when an index, a document or the upload folder cannot be
                deleted.
        """
        if logflag:
            logger.info(f"[ redis delete ] delete files: {file_path}")

        # delete all uploaded files
        if file_path == "all":
            if logflag:
                logger.info("[ redis delete ] delete all files")

            # drop index KEY_INDEX_NAME
            # Results are tested with `if`, not `assert`: asserts vanish under `python -O`.
            if await check_index_existance(self.key_index_client):
                if not drop_index(index_name=KEY_INDEX_NAME):
                    if logflag:
                        logger.info(f"[ redis delete ] Fail to drop index {KEY_INDEX_NAME}.")
                    raise HTTPException(status_code=500, detail=f"Fail to drop index {KEY_INDEX_NAME}.")
            else:
                logger.info(f"[ redis delete ] Index {KEY_INDEX_NAME} does not exits.")

            # List the remaining indices once and drop each of them.
            indices = self.get_list_of_indices()
            if indices:
                for i in indices:
                    # drop index INDEX_NAME
                    if not drop_index(index_name=i):
                        if logflag:
                            logger.info(f"[ redis delete ] Fail to drop index {i}.")
                        raise HTTPException(status_code=500, detail=f"Fail to drop index {i}.")
                    logger.info(f"[ redis delete ] Index_name: {i} is deleted.")
            else:
                if logflag:
                    logger.info("[ redis delete ] There is no index_name registered to redis db.")

            # delete files on local disk
            try:
                remove_folder_with_ignore(upload_folder)
            except Exception as e:
                if logflag:
                    logger.info(f"[ redis delete ] {e}. Fail to delete {upload_folder}.")
                raise HTTPException(status_code=500, detail=f"Fail to delete {upload_folder}.")

            if logflag:
                logger.info("[ redis delete ] successfully delete all files.")
            create_upload_folder(upload_folder)
            if logflag:
                logger.info({"status": True})
            return {"status": True}

        # partially delete files
        encode_file = encode_filename(file_path)
        delete_path = Path(upload_folder + "/" + encode_file)
        if logflag:
            logger.info(f"[ redis delete ] delete_path: {delete_path}")

        # Falsy index_name -> default, matching how ingestion derives the stored key.
        index_name = index_name if index_name else INDEX_NAME
        index_name_id = encode_filename(index_name)
        doc_id = "file:" + index_name_id + "_" + encode_file
        logger.info(f"[ redis delete ] doc id: {doc_id}")

        # determine whether this file exists in db KEY_INDEX_NAME
        key_ids = await self._lookup_key_ids(doc_id)
        if key_ids is None:
            if logflag:
                logger.info(f"[ redis delete ] File {file_path} does not exists.")
            raise HTTPException(
                status_code=404, detail=f"File not found in db {KEY_INDEX_NAME}. Please check file_path."
            )
        # A file ingested with zero chunks has an empty key_ids string; skip empty ids.
        file_ids = [file_id for file_id in key_ids.split("#") if file_id]

        # delete file keys id in db KEY_INDEX_NAME
        if not await delete_by_id(self.key_index_client, doc_id):
            if logflag:
                logger.info(f"[ redis delete ] File {file_path} delete failed for db {KEY_INDEX_NAME}.")
            raise HTTPException(status_code=500, detail=f"File {file_path} delete failed for key index.")

        # delete file content in db index_name
        # NOTE(review): data_index_client is bound to INDEX_NAME at construction time, so
        # chunks ingested into a custom index_name are still deleted through the default
        # index handle — confirm that this works against the deployed search module.
        for file_id in file_ids:
            # No existence probe here: search_by_id swallows every error and returns
            # None, so a "not found" branch around it could never trigger.
            if not await delete_by_id(self.data_index_client, file_id):
                if logflag:
                    logger.info(f"[ redis delete ] File {file_path} delete failed for db {index_name}")
                raise HTTPException(status_code=500, detail=f"File {file_path} delete failed for index.")

        # local file does not exist (restarted docker container)
        if not delete_path.exists():
            if logflag:
                logger.info(f"[ redis delete ] File {file_path} not saved locally.")
            return {"status": True}

        # delete local file
        if delete_path.is_file():
            # delete file on local disk
            delete_path.unlink()
            if logflag:
                logger.info(f"[ redis delete ] File {file_path} deleted successfully.")
            return {"status": True}

        # delete folder
        if logflag:
            logger.info(f"[ redis delete ] Delete folder {file_path} is not supported for now.")
        raise HTTPException(status_code=404, detail=f"Delete folder {file_path} is not supported for now.")

    def get_list_of_indices(self):
        """Retrieves a list of all indices from the Redis client.

        Returns:
            A list of index names as strings.
        """
        # Execute the command to list all indices
        indices = self.client.execute_command("FT._LIST")

        # Decode each index name from bytes to string; names that already arrive as
        # str (e.g. from a client configured to decode responses) pass through.
        return [item.decode("utf-8") if isinstance(item, bytes) else item for item in indices]