Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 15 additions & 15 deletions src/load_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,35 +66,35 @@ def get_document_pk(doc: Document, pk_metadata_fields: Iterable[str]) -> str:
concatened_fields = "".join(fields)
return hashlib.md5(concatened_fields.encode()).hexdigest()

# Single CSV load: the CSV is read once with every required metadata column (NECESSARY_COLS + metadata_columns) instead of being loaded twice.
# Direct filtering: unused metadata fields are dropped while the final Documents are built, so two separate sets of metadata no longer need to be merged.
# Clarity: the function is easier to follow and performs no redundant work.

def load_csv_with_metadata(
    path: str,
    embed_columns: Optional[list[str]] = None,
    metadata_columns: Optional[List[str]] = None,
) -> List[Document]:
    """Load a CSV once and return one Document per row with cleaned metadata.

    The file is read a single time with ``NECESSARY_COLS`` plus any
    caller-supplied ``metadata_columns`` requested as metadata. The loader's
    bookkeeping fields ``"row"`` and ``"source"`` are removed from each
    document's metadata before it is returned.

    Args:
        path: Path of the CSV file to load.
        embed_columns: Accepted for backward compatibility; this function
            does not read it.
        metadata_columns: Extra CSV columns to expose as document metadata in
            addition to ``NECESSARY_COLS``. ``None`` means no extra columns.

    Returns:
        A list of Documents, in the order produced by the loader, whose
        metadata contains every loaded field except ``"row"`` and ``"source"``.
    """
    # ``None`` defaults replace the former ``[]`` defaults so no list object is
    # shared between calls.
    extra_columns = metadata_columns or []

    # One pass over the file: request the required columns and the optional
    # ones together instead of loading the CSV twice and merging the results.
    # NOTE(review): columns requested as metadata are presumably left out of
    # page_content by CSVLoader -- confirm this is acceptable for embedding.
    loader = CSVLoader(
        path, metadata_columns=[*NECESSARY_COLS, *extra_columns]
    )

    # Fields added by the loader that should not be carried in the output.
    not_used_metadata_fields = {"row", "source"}

    return [
        Document(
            page_content=doc.page_content,
            metadata={
                key: value
                for key, value in doc.metadata.items()
                if key not in not_used_metadata_fields
            },
        )
        for doc in loader.load()
    ]



def load_csv_and_generate_embeddings(
path, session, cleardb=False, embed_columns: Optional[list[str]] = None
):
Expand Down