diff --git a/libs/astradb/langchain_astradb/utils/encoders.py b/libs/astradb/langchain_astradb/utils/encoders.py
index 5e77c1b..d791e98 100644
--- a/libs/astradb/langchain_astradb/utils/encoders.py
+++ b/libs/astradb/langchain_astradb/utils/encoders.py
@@ -28,7 +28,7 @@ def _default_encode_filter(filter_dict: dict[str, Any]) -> dict[str, Any]:
     return metadata_filter
 
 
-class VSDocumentEncoder(ABC):
+class _AstraDBVectorStoreDocumentEncoder(ABC):
     """A document encoder for the Astra DB vector store.
 
     The document encoder contains the information for consistent interaction
@@ -36,8 +36,8 @@ class VSDocumentEncoder(ABC):
 
     Implementations of this class must:
     - define how to encode/decode documents consistently to and from
-      Astra DB collections. The two operations must combine to the identity
-      on both sides.
+      Astra DB collections. Up to the quirks of their signatures, the two
+      operations must combine to the identity on both sides.
     - provide the adequate projection dictionaries for running find
       operations on Astra DB, with and without the field containing the vector.
     - encode IDs to the `_id` field on Astra DB.
@@ -98,7 +98,7 @@ def encode_filter(self, filter_dict: dict[str, Any]) -> dict[str, Any]:
         """
 
 
-class DefaultVSDocumentEncoder(VSDocumentEncoder):
+class _DefaultVSDocumentEncoder(_AstraDBVectorStoreDocumentEncoder):
     """Encoder for the default vector store usage with client-side embeddings.
 
     This encoder expresses how documents are stored for collections created
@@ -148,7 +148,7 @@ def encode_filter(self, filter_dict: dict[str, Any]) -> dict[str, Any]:
         return _default_encode_filter(filter_dict)
 
 
-class DefaultVectorizeVSDocumentEncoder(VSDocumentEncoder):
+class _DefaultVectorizeVSDocumentEncoder(_AstraDBVectorStoreDocumentEncoder):
     """Encoder for the default vector store usage with server-side embeddings.
 
     This encoder expresses how documents are stored for collections created
diff --git a/libs/astradb/langchain_astradb/vectorstores.py b/libs/astradb/langchain_astradb/vectorstores.py
index 90ceab5..bb2d5ad 100644
--- a/libs/astradb/langchain_astradb/vectorstores.py
+++ b/libs/astradb/langchain_astradb/vectorstores.py
@@ -33,9 +33,9 @@
     _AstraDBCollectionEnvironment,
 )
 from langchain_astradb.utils.encoders import (
-    DefaultVectorizeVSDocumentEncoder,
-    DefaultVSDocumentEncoder,
-    VSDocumentEncoder,
+    _AstraDBVectorStoreDocumentEncoder,
+    _DefaultVectorizeVSDocumentEncoder,
+    _DefaultVSDocumentEncoder,
 )
 from langchain_astradb.utils.mmr import maximal_marginal_relevance
 
@@ -400,11 +400,11 @@ def __init__(
         self.environment = environment
         self.namespace = namespace
         self.collection_vector_service_options = collection_vector_service_options
-        self.document_encoder: VSDocumentEncoder
+        self.document_encoder: _AstraDBVectorStoreDocumentEncoder
         if self.collection_vector_service_options is not None:
-            self.document_encoder = DefaultVectorizeVSDocumentEncoder()
+            self.document_encoder = _DefaultVectorizeVSDocumentEncoder()
         else:
-            self.document_encoder = DefaultVSDocumentEncoder()
+            self.document_encoder = _DefaultVSDocumentEncoder()
         self.collection_embedding_api_key = collection_embedding_api_key
         # Concurrency settings
         self.batch_size: int | None = batch_size or DEFAULT_DOCUMENT_CHUNK_SIZE
@@ -931,129 +931,56 @@ async def _replace_document(
             raise ValueError(msg)
         return inserted_ids
 
-    def similarity_search_with_score_id_by_vector(
+    @override
+    def similarity_search(
         self,
-        embedding: list[float],
+        query: str,
         k: int = 4,
-        filter: dict[str, Any] | None = None,  # noqa: A002
-    ) -> list[tuple[Document, float, str]]:
-        """Return docs most similar to embedding vector with score and id.
+        filter: dict[str, Any] | None = None,
+        **kwargs: Any,
+    ) -> list[Document]:
+        """Return docs most similar to query.
 
         Args:
-            embedding: Embedding to look up documents similar to.
+            query: Query to look up documents similar to.
             k: Number of Documents to return. Defaults to 4.
             filter: Filter on the metadata to apply.
 
         Returns:
-            The list of (Document, score, id), the most similar to the query vector.
+            The list of Documents most similar to the query.
         """
-        self.astra_env.ensure_db_setup()
-        metadata_parameter = self._filter_to_metadata(filter)
-        hits = list(
-            self.astra_env.collection.find(
-                filter=metadata_parameter,
-                projection=self.document_encoder.base_projection,
-                limit=k,
-                include_similarity=True,
-                sort={"$vector": embedding},
-            )
-        )
         return [
-            (
-                self.document_encoder.decode(hit),
-                hit["$similarity"],
-                hit["_id"],
+            doc
+            for (doc, _, _) in self.similarity_search_with_score_id(
+                query=query,
+                k=k,
+                filter=filter,
            )
-            for hit in hits
         ]
 
-    async def asimilarity_search_with_score_id_by_vector(
+    @override
+    def similarity_search_with_score(
         self,
-        embedding: list[float],
+        query: str,
         k: int = 4,
-        filter: dict[str, Any] | None = None,  # noqa: A002
-    ) -> list[tuple[Document, float, str]]:
-        """Return docs most similar to embedding vector with score and id.
+        filter: dict[str, Any] | None = None,
+    ) -> list[tuple[Document, float]]:
+        """Return docs most similar to query with score.
 
         Args:
-            embedding: Embedding to look up documents similar to.
+            query: Query to look up documents similar to.
             k: Number of Documents to return. Defaults to 4.
             filter: Filter on the metadata to apply.
 
         Returns:
-            The list of (Document, score, id), the most similar to the query vector.
- """ - await self.astra_env.aensure_db_setup() - metadata_parameter = self._filter_to_metadata(filter) - return [ - ( - self.document_encoder.decode(hit), - hit["$similarity"], - hit["_id"], - ) - async for hit in self.astra_env.async_collection.find( - filter=metadata_parameter, - projection=self.document_encoder.base_projection, - limit=k, - include_similarity=True, - sort={"$vector": embedding}, - ) - ] - - def _similarity_search_with_score_id_with_vectorize( - self, - query: str, - k: int = 4, - filter: dict[str, Any] | None = None, # noqa: A002 - ) -> list[tuple[Document, float, str]]: - """Return docs most similar to the query with score and id using $vectorize. - - This is only available when using server-side embeddings. - """ - self.astra_env.ensure_db_setup() - metadata_parameter = self._filter_to_metadata(filter) - hits = list( - self.astra_env.collection.find( - filter=metadata_parameter, - projection=self.document_encoder.base_projection, - limit=k, - include_similarity=True, - sort={"$vectorize": query}, - ) - ) - return [ - ( - self.document_encoder.decode(hit), - hit["$similarity"], - hit["_id"], - ) - for hit in hits - ] - - async def _asimilarity_search_with_score_id_with_vectorize( - self, - query: str, - k: int = 4, - filter: dict[str, Any] | None = None, # noqa: A002 - ) -> list[tuple[Document, float, str]]: - """Return docs most similar to the query with score and id using $vectorize. - - This is only available when using server-side embeddings. + The list of (Document, score), the most similar to the query vector. """ - await self.astra_env.aensure_db_setup() - metadata_parameter = self._filter_to_metadata(filter) return [ - ( - self.document_encoder.decode(hit), - hit["$similarity"], - hit["_id"], - ) - async for hit in self.astra_env.async_collection.find( - filter=metadata_parameter, - projection=self.document_encoder.base_projection, - limit=k, - include_similarity=True, - sort={"$vectorize": query}, + (doc, score) + for (doc, score, _) in self.similarity_search_with_score_id( + query=query, + k=k, + filter=filter, ) ] @@ -1074,8 +1001,9 @@ def similarity_search_with_score_id( The list of (Document, score, id), the most similar to the query. """ if self.document_encoder.server_side_embeddings: - return self._similarity_search_with_score_id_with_vectorize( - query=query, + sort = {"$vectorize": query} + return self._similarity_search_with_score_id_by_sort( + sort=sort, k=k, filter=filter, ) @@ -1087,35 +1015,32 @@ def similarity_search_with_score_id( filter=filter, ) - async def asimilarity_search_with_score_id( + @override + def similarity_search_by_vector( self, - query: str, + embedding: list[float], k: int = 4, - filter: dict[str, Any] | None = None, # noqa: A002 - ) -> list[tuple[Document, float, str]]: - """Return docs most similar to the query with score and id. + filter: dict[str, Any] | None = None, + **kwargs: Any, + ) -> list[Document]: + """Return docs most similar to embedding vector. Args: - query: Query to look up documents similar to. + embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. filter: Filter on the metadata to apply. Returns: - The list of (Document, score, id), the most similar to the query. + The list of Documents most similar to the query vector. 
""" - if self.document_encoder.server_side_embeddings: - return await self._asimilarity_search_with_score_id_with_vectorize( - query=query, + return [ + doc + for (doc, _, _) in self.similarity_search_with_score_id_by_vector( + embedding=embedding, k=k, filter=filter, ) - - embedding_vector = await self._get_safe_embedding().aembed_query(query) - return await self.asimilarity_search_with_score_id_by_vector( - embedding=embedding_vector, - k=k, - filter=filter, - ) + ] def similarity_search_with_score_by_vector( self, @@ -1135,20 +1060,20 @@ def similarity_search_with_score_by_vector( """ return [ (doc, score) - for (doc, score, doc_id) in self.similarity_search_with_score_id_by_vector( + for (doc, score, _) in self.similarity_search_with_score_id_by_vector( embedding=embedding, k=k, filter=filter, ) ] - async def asimilarity_search_with_score_by_vector( + def similarity_search_with_score_id_by_vector( self, embedding: list[float], k: int = 4, filter: dict[str, Any] | None = None, # noqa: A002 - ) -> list[tuple[Document, float]]: - """Return docs most similar to embedding vector with score. + ) -> list[tuple[Document, float, str]]: + """Return docs most similar to embedding vector with score and id. Args: embedding: Embedding to look up documents similar to. @@ -1156,23 +1081,48 @@ async def asimilarity_search_with_score_by_vector( filter: Filter on the metadata to apply. Returns: - The list of (Document, score), the most similar to the query vector. + The list of (Document, score, id), the most similar to the query vector. """ + if self.document_encoder.server_side_embeddings: + msg = ( + "Searching by vector on a Vector Store that uses server-side " + "embeddings is not allowed." + ) + raise ValueError(msg) + sort = {"$vector": embedding} + return self._similarity_search_with_score_id_by_sort( + sort=sort, + k=k, + filter=filter, + ) + + def _similarity_search_with_score_id_by_sort( + self, + sort: dict[str, Any], + k: int = 4, + filter: dict[str, Any] | None = None, # noqa: A002 + ) -> list[tuple[Document, float, str]]: + """Run ANN search with a provided sort clause.""" + self.astra_env.ensure_db_setup() + metadata_parameter = self._filter_to_metadata(filter) + hits_ite = self.astra_env.collection.find( + filter=metadata_parameter, + projection=self.document_encoder.base_projection, + limit=k, + include_similarity=True, + sort=sort, + ) return [ - (doc, score) - for ( - doc, - score, - doc_id, - ) in await self.asimilarity_search_with_score_id_by_vector( - embedding=embedding, - k=k, - filter=filter, + ( + self.document_encoder.decode(hit), + hit["$similarity"], + hit["_id"], ) + for hit in hits_ite ] @override - def similarity_search( + async def asimilarity_search( self, query: str, k: int = 4, @@ -1189,32 +1139,23 @@ def similarity_search( Returns: The list of Documents most similar to the query. 
""" - if self.document_encoder.server_side_embeddings: - return [ - doc - for (doc, _, _) in self._similarity_search_with_score_id_with_vectorize( - query, - k, - filter=filter, - ) - ] - - embedding_vector = self._get_safe_embedding().embed_query(query) - return self.similarity_search_by_vector( - embedding_vector, - k, - filter=filter, - ) + return [ + doc + for (doc, _, _) in await self.asimilarity_search_with_score_id( + query=query, + k=k, + filter=filter, + ) + ] @override - async def asimilarity_search( + async def asimilarity_search_with_score( self, query: str, k: int = 4, filter: dict[str, Any] | None = None, - **kwargs: Any, - ) -> list[Document]: - """Return docs most similar to query. + ) -> list[tuple[Document, float]]: + """Return docs most similar to query with score. Args: query: Query to look up documents similar to. @@ -1222,55 +1163,47 @@ async def asimilarity_search( filter: Filter on the metadata to apply. Returns: - The list of Documents most similar to the query. + The list of (Document, score), the most similar to the query vector. """ - if self.document_encoder.server_side_embeddings: - return [ - doc - for ( - doc, - _, - _, - ) in await self._asimilarity_search_with_score_id_with_vectorize( - query, - k, - filter=filter, - ) - ] - - embedding_vector = await self._get_safe_embedding().aembed_query(query) - return await self.asimilarity_search_by_vector( - embedding_vector, - k, - filter=filter, - ) + return [ + (doc, score) + for (doc, score, _) in await self.asimilarity_search_with_score_id( + query=query, + k=k, + filter=filter, + ) + ] - @override - def similarity_search_by_vector( + async def asimilarity_search_with_score_id( self, - embedding: list[float], + query: str, k: int = 4, - filter: dict[str, Any] | None = None, - **kwargs: Any, - ) -> list[Document]: - """Return docs most similar to embedding vector. + filter: dict[str, Any] | None = None, # noqa: A002 + ) -> list[tuple[Document, float, str]]: + """Return docs most similar to the query with score and id. Args: - embedding: Embedding to look up documents similar to. + query: Query to look up documents similar to. k: Number of Documents to return. Defaults to 4. filter: Filter on the metadata to apply. Returns: - The list of Documents most similar to the query vector. + The list of (Document, score, id), the most similar to the query. """ - return [ - doc - for doc, _ in self.similarity_search_with_score_by_vector( - embedding, - k, + if self.document_encoder.server_side_embeddings: + sort = {"$vectorize": query} + return await self._asimilarity_search_with_score_id_by_sort( + sort=sort, + k=k, filter=filter, ) - ] + + embedding_vector = await self._get_safe_embedding().aembed_query(query) + return await self.asimilarity_search_with_score_id_by_vector( + embedding=embedding_vector, + k=k, + filter=filter, + ) @override async def asimilarity_search_by_vector( @@ -1292,89 +1225,91 @@ async def asimilarity_search_by_vector( """ return [ doc - for doc, _ in await self.asimilarity_search_with_score_by_vector( - embedding, - k, + for (doc, _, _) in await self.asimilarity_search_with_score_id_by_vector( + embedding=embedding, + k=k, filter=filter, ) ] - @override - def similarity_search_with_score( + async def asimilarity_search_with_score_by_vector( self, - query: str, + embedding: list[float], k: int = 4, - filter: dict[str, Any] | None = None, + filter: dict[str, Any] | None = None, # noqa: A002 ) -> list[tuple[Document, float]]: - """Return docs most similar to query with score. 
+ """Return docs most similar to embedding vector with score. Args: - query: Query to look up documents similar to. + embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. filter: Filter on the metadata to apply. Returns: The list of (Document, score), the most similar to the query vector. """ - if self.document_encoder.server_side_embeddings: - return [ - (doc, score) - for ( - doc, - score, - doc_id, - ) in self._similarity_search_with_score_id_with_vectorize( - query=query, - k=k, - filter=filter, - ) - ] - - embedding_vector = self._get_safe_embedding().embed_query(query) - return self.similarity_search_with_score_by_vector( - embedding_vector, - k, - filter=filter, - ) + return [ + (doc, scr) + for (doc, scr, _) in await self.asimilarity_search_with_score_id_by_vector( + embedding=embedding, + k=k, + filter=filter, + ) + ] - @override - async def asimilarity_search_with_score( + async def asimilarity_search_with_score_id_by_vector( self, - query: str, + embedding: list[float], k: int = 4, - filter: dict[str, Any] | None = None, - ) -> list[tuple[Document, float]]: - """Return docs most similar to query with score. + filter: dict[str, Any] | None = None, # noqa: A002 + ) -> list[tuple[Document, float, str]]: + """Return docs most similar to embedding vector with score and id. Args: - query: Query to look up documents similar to. + embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. filter: Filter on the metadata to apply. Returns: - The list of (Document, score), the most similar to the query vector. + The list of (Document, score, id), the most similar to the query vector. """ if self.document_encoder.server_side_embeddings: - return [ - (doc, score) - for ( - doc, - score, - doc_id, - ) in await self._asimilarity_search_with_score_id_with_vectorize( - query=query, - k=k, - filter=filter, - ) - ] - - embedding_vector = await self._get_safe_embedding().aembed_query(query) - return await self.asimilarity_search_with_score_by_vector( - embedding_vector, - k, + msg = ( + "Searching by vector on a Vector Store that uses server-side " + "embeddings is not allowed." 
+            )
+            raise ValueError(msg)
+        sort = {"$vector": embedding}
+        return await self._asimilarity_search_with_score_id_by_sort(
+            sort=sort,
+            k=k,
            filter=filter,
         )
 
+    async def _asimilarity_search_with_score_id_by_sort(
+        self,
+        sort: dict[str, Any],
+        k: int = 4,
+        filter: dict[str, Any] | None = None,  # noqa: A002
+    ) -> list[tuple[Document, float, str]]:
+        """Run ANN search with a provided sort clause."""
+        await self.astra_env.aensure_db_setup()
+        metadata_parameter = self._filter_to_metadata(filter)
+        return [
+            (
+                self.document_encoder.decode(hit),
+                hit["$similarity"],
+                hit["_id"],
+            )
+            async for hit in self.astra_env.async_collection.find(
+                filter=metadata_parameter,
+                projection=self.document_encoder.base_projection,
+                limit=k,
+                include_similarity=True,
+                sort=sort,
+            )
+        ]
+
     def _run_mmr_query_by_sort(
         self,
         sort: dict[str, Any],
diff --git a/libs/astradb/tests/unit_tests/test_vs_doc_encoders.py b/libs/astradb/tests/unit_tests/test_vs_doc_encoders.py
new file mode 100644
index 0000000..07591b1
--- /dev/null
+++ b/libs/astradb/tests/unit_tests/test_vs_doc_encoders.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import pytest
+from langchain_core.documents import Document
+
+from langchain_astradb.utils.encoders import (
+    _DefaultVectorizeVSDocumentEncoder,
+    _DefaultVSDocumentEncoder,
+)
+
+METADATA = {"m1": 1, "m2": "two"}
+CONTENT = "The content"
+VECTOR: list[float] = [1, 2, 3]
+DOCUMENT_ID = "the_id"
+LC_DOCUMENT = Document(page_content=CONTENT, metadata=METADATA)
+ASTRA_DEFAULT_DOCUMENT_NOVECTORIZE = {
+    "_id": DOCUMENT_ID,
+    "content": CONTENT,
+    "metadata": METADATA,
+    "$vector": VECTOR,
+}
+ASTRA_DEFAULT_DOCUMENT_VECTORIZE = {
+    "_id": DOCUMENT_ID,
+    "$vectorize": CONTENT,
+    "metadata": METADATA,
+}
+LC_FILTER = {"a0": 0, "$or": [{"b1": 1}, {"b2": 2}]}
+ASTRA_DEFAULT_FILTER = {
+    "metadata.a0": 0,
+    "$or": [{"metadata.b1": 1}, {"metadata.b2": 2}],
+}
+
+
+class TestVSDocEncoders:
+    def test_default_novectorize_encoding(self) -> None:
+        """Test encoding for default, no-vectorize."""
+        encoder = _DefaultVSDocumentEncoder()
+        encoded_doc = encoder.encode(
+            content=CONTENT,
+            document_id=DOCUMENT_ID,
+            vector=VECTOR,
+            metadata=METADATA,
+        )
+        assert encoded_doc == ASTRA_DEFAULT_DOCUMENT_NOVECTORIZE
+
+    def test_default_novectorize_vector_required(self) -> None:
+        """Test vector is required for default encoding, no-vectorize."""
+        encoder = _DefaultVSDocumentEncoder()
+        with pytest.raises(ValueError):
+            encoder.encode(
+                content=CONTENT,
+                document_id=DOCUMENT_ID,
+                vector=None,
+                metadata=METADATA,
+            )
+
+    def test_default_novectorize_decoding(self) -> None:
+        """Test decoding for default, no-vectorize."""
+        encoder = _DefaultVSDocumentEncoder()
+        decoded_doc = encoder.decode(ASTRA_DEFAULT_DOCUMENT_NOVECTORIZE)
+        assert decoded_doc == LC_DOCUMENT
+
+    def test_default_novectorize_filtering(self) -> None:
+        """Test filter-encoding for default, no-vectorize."""
+        encoder = _DefaultVSDocumentEncoder()
+        encoded_flt = encoder.encode_filter(LC_FILTER)
+        assert encoded_flt == ASTRA_DEFAULT_FILTER
+
+    def test_default_vectorize_encoding(self) -> None:
+        """Test encoding for default, vectorize."""
+        encoder = _DefaultVectorizeVSDocumentEncoder()
+        encoded_doc = encoder.encode(
+            content=CONTENT,
+            document_id=DOCUMENT_ID,
+            vector=None,
+            metadata=METADATA,
+        )
+        assert encoded_doc == ASTRA_DEFAULT_DOCUMENT_VECTORIZE
+
+    def test_default_vectorize_vector_forbidden(self) -> None:
+        """Test vector is not allowed for default encoding, vectorize."""
+        encoder = _DefaultVectorizeVSDocumentEncoder()
+        with pytest.raises(ValueError):
+            encoder.encode(
+                content=CONTENT,
+                document_id=DOCUMENT_ID,
+                vector=VECTOR,
+                metadata=METADATA,
+            )
+
+    def test_default_vectorize_decoding(self) -> None:
+        """Test decoding for default, vectorize."""
+        encoder = _DefaultVectorizeVSDocumentEncoder()
+        decoded_doc = encoder.decode(ASTRA_DEFAULT_DOCUMENT_VECTORIZE)
+        assert decoded_doc == LC_DOCUMENT
+
+    def test_default_vectorize_filtering(self) -> None:
+        """Test filter-encoding for default, vectorize."""
+        encoder = _DefaultVectorizeVSDocumentEncoder()
+        encoded_flt = encoder.encode_filter(LC_FILTER)
+        assert encoded_flt == ASTRA_DEFAULT_FILTER
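
Note (not part of the patch): a minimal sketch of the round-trip contract the renamed encoder classes are expected to satisfy, using only the `encode`/`decode`/`encode_filter` calls and the expected document/filter shapes exercised by the new unit tests above. The literal values mirror the test constants; everything else about the module is assumed from this diff.

```python
from langchain_core.documents import Document

from langchain_astradb.utils.encoders import _DefaultVSDocumentEncoder

# Client-side-embeddings encoder: the vector travels alongside the document.
encoder = _DefaultVSDocumentEncoder()

astra_doc = encoder.encode(
    content="The content",
    document_id="the_id",
    vector=[1, 2, 3],
    metadata={"m1": 1, "m2": "two"},
)
# Stored shape: {"_id": ..., "content": ..., "metadata": ..., "$vector": ...}

# decode(encode(...)) gives back the LangChain Document; the id and vector are
# passed separately, hence "identity up to the quirks of the signatures".
assert encoder.decode(astra_doc) == Document(
    page_content="The content", metadata={"m1": 1, "m2": "two"}
)

# Metadata filters are rewritten onto the "metadata." prefix, while operators
# such as "$or" stay at the top level.
assert encoder.encode_filter({"a0": 0, "$or": [{"b1": 1}, {"b2": 2}]}) == {
    "metadata.a0": 0,
    "$or": [{"metadata.b1": 1}, {"metadata.b2": 2}],
}
```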