11import gspread
2+ import logging
23
34from dialog_lib .db .models import CompanyContent
45from dialog_lib .embeddings .generate import generate_embedding
1112from typing import Any , Dict , Iterator , List , Optional , Sequence , Union
1213
1314
15+ logger = logging .getLogger (__name__ )
16+
1417class GoogleSheetsLoader (BaseLoader ):
1518 def __init__ (self , credentials_path : Union [str , Path ], spreadsheet_url : str , sheet_name : str ):
1619 self .sheet_name = sheet_name
@@ -59,15 +62,19 @@ def load_google_sheets(
5962 values = line .split (": " )
6063 content [values [0 ]] = values [1 ]
6164
62- company_content = CompanyContent (
63- category = "csv" ,
64- subcategory = "csv-content" ,
65- question = content ["question" ],
66- content = content ["content" ],
67- dataset = company_id ,
68- embedding = generate_embedding (csv_content .page_content , embeddings_model_instance )
69- )
70- dbsession .add (company_content )
65+ if not dbsession .query (CompanyContent ).filter (
66+ CompanyContent .question == content ["question" ], CompanyContent .content == content ["content" ]
67+ ).first ():
68+ company_content = CompanyContent (
69+ category = "csv" ,
70+ subcategory = "csv-content" ,
71+ question = content ["question" ],
72+ content = content ["content" ],
73+ dataset = company_id ,
74+ embedding = generate_embedding (csv_content .page_content , embeddings_model_instance )
75+ )
76+ dbsession .add (company_content )
77+ else :
78+ logger .warning (f"Question: { content ['question' ]} already exists in the database. Skipping." )
7179
72- dbsession .commit ()
73- return company_content
80+ dbsession .commit ()
0 commit comments