3232
3333
3434class BaseBigQueryVectorStore (VectorStore , BaseModel , ABC ):
35- """
36- Abstract base class for BigQuery-based vector stores.
35+ """Abstract base class for BigQuery-based vector stores.
3736
3837 This class provides a foundation for storing, retrieving, and searching documents
3938 and their corresponding embeddings in BigQuery.
4039
41- Attributes:
42- embedding: Embedding model for generating and comparing embeddings.
43- project_id: Google Cloud Project ID where BigQuery resources are located.
44- dataset_name: BigQuery dataset name.
45- table_name: BigQuery table name.
46- location: BigQuery region/location.
47- content_field: Name of the column storing document content (default: "content").
48- embedding_field: Name of the column storing text embeddings (default:
49- "embedding").
50- temp_dataset_name: Name of the BigQuery dataset to be used to upload temporary
51- BQ tables. If None, will default to "{dataset_name}_temp".
52- doc_id_field: Name of the column storing document IDs (default: "doc_id").
53- credentials: Optional Google Cloud credentials object.
54- embedding_dimension: Dimension of the embedding vectors (inferred if not
55- provided).
56-
5740 Abstract Methods:
5841 sync_data: Synchronizes data between the vector store and BigQuery.
5942 get_documents: Retrieves documents based on IDs or filters.
@@ -62,21 +45,51 @@ class BaseBigQueryVectorStore(VectorStore, BaseModel, ABC):
6245 """
6346
6447 model_config = ConfigDict (arbitrary_types_allowed = True )
48+
6549 embedding : Embeddings
50+ """Embedding model for generating and comparing embeddings."""
51+
6652 project_id : str
53+ """Google Cloud Project ID where BigQuery resources are located."""
54+
6755 dataset_name : str
56+ """BigQuery dataset name."""
57+
6858 table_name : str
59+ """BigQuery table name."""
60+
6961 location : str
62+ """BigQuery region/location."""
63+
7064 content_field : str = "content"
65+ """Name of the column storing document content."""
66+
7167 embedding_field : str = "embedding"
68+ """Name of the column storing text embeddings."""
69+
7270 doc_id_field : str = "doc_id"
71+ """Name of the column storing document IDs."""
72+
7373 temp_dataset_name : Optional [str ] = None
74+ """Name of the BigQuery dataset to be used to upload temporary BQ tables.
75+
76+ If `None`, will default to `'{dataset_name}_temp'`.
77+ """
78+
7479 credentials : Optional [Any ] = None
80+ """Optional Google Cloud credentials object."""
81+
7582 embedding_dimension : Optional [int ] = None
83+ """Dimension of the embedding vectors (inferred if not provided)."""
84+
7685 extra_fields : Union [Dict [str , str ], None ] = None
86+
7787 table_schema : Any = None
88+
7889 _bq_client : Any = None
90+
7991 _logger : Any = None
92+
8093 _full_table_id : Optional [str ] = None
8194
8295 @abstractmethod
@@ -249,8 +262,9 @@ def add_texts( # type: ignore[override]
249262 Args:
250263 texts: List of strings to add to the `VectorStore`.
251264 metadatas: Optional list of metadata records associated with the texts.
252- (ie [{"url": "www.myurl1.com", "title": "title1"},
253- {"url": "www.myurl2.com", "title": "title2"}])
265+
266+ (e.g. `[{"url": "www.myurl1.com", "title": "title1"},
267+ {"url": "www.myurl2.com", "title": "title2"}]`)
254268
255269 Returns:
256270 List of IDs from adding the texts into the `VectorStore`.
@@ -266,18 +280,19 @@ def add_texts_with_embeddings(
266280 embs : List [List [float ]],
267281 metadatas : Optional [List [dict ]] = None ,
268282 ) -> List [str ]:
269- """Add precomputed embeddings and relative texts / metadatas to the `VectorStore`.
283+ """Add precomputed embeddings & related texts / metadatas to the `VectorStore`.
270284
271285 Args:
272286 ids: List of unique IDs in string format
273287 texts: List of strings to add to the `VectorStore`.
274288 embs: List of lists of floats with text embeddings for texts.
275289 metadatas: Optional list of metadata records associated with the texts.
276- (ie `[{"url": "www.myurl1.com", "title": "title1"},
290+
291+ (e.g. `[{"url": "www.myurl1.com", "title": "title1"},
277292 {"url": "www.myurl2.com", "title": "title2"}]`)
278293 Returns:
279294 List of IDs from adding the texts into the `VectorStore`.
280- """ # noqa: E501
295+ """
281296 import pandas as pd
282297
283298 ids = [uuid .uuid4 ().hex for _ in texts ]
@@ -360,21 +375,21 @@ def similarity_search_by_vectors(
360375 with_embeddings : bool = False ,
361376 ** kwargs : Any ,
362377 ) -> Any :
363- """Core similarity search function. Handles a list of embedding vectors,
364- optionally returning scores and embeddings.
378+ """Core similarity search function.
379+
380+ Handles a list of embedding vectors, optionally returning scores and embeddings.
365381
366382 Args:
367- embeddings: A list of embedding vectors, where each vector is a list of
383+ embeddings: List of embedding vectors, where each vector is a list of
368384 floats.
369- filter: (Optional) A dictionary specifying filtering criteria for the
370- documents.
371- Ie. {"title": "mytitle"}
372- k: (Optional) The number of top-ranking similar documents to return per
373- embedding. Defaults to 5.
374- with_scores: (Optional) If True, include similarity scores in the result
375- for each matched document. Defaults to False.
376- with_embeddings: (Optional) If True, include the matched document's
377- embedding vector in the result. Defaults to False.
385+ filter: Dictionary specifying filtering criteria for the documents.
386+
387+ e.g. `{"title": "mytitle"}`
388+ k: Number of top-ranking similar documents to return per embedding.
389+ with_scores: If `True`, include similarity scores in the result for each
390+ matched document.
391+ with_embeddings: If `True`, include the matched document's embedding vector
392+ in the result.
378393 Returns:
379394 A list of `k` documents for each embedding in `embeddings`
380395 """
@@ -406,10 +421,11 @@ def similarity_search_by_vector(
406421
407422 Args:
408423 embedding: Embedding to look up documents similar to.
409- filter: (Optional) A dictionary specifying filtering criteria for the
410- documents. Ie. {"title": "mytitle"}
411- k: (Optional) The number of top-ranking similar documents to return per
412- embedding. Defaults to 5.
424+ filter: Dictionary specifying filtering criteria for the documents.
425+
426+ e.g. `{"title": "mytitle"}`
427+ k: Number of top-ranking similar documents to return per embedding.
428+
413429 Returns:
414430 Return docs most similar to embedding vector.
415431 """
@@ -427,10 +443,11 @@ def similarity_search_by_vector_with_score(
427443
428444 Args:
429445 embedding: Embedding to look up documents similar to.
430- filter: (Optional) A dictionary specifying filtering criteria for the
431- documents. Ie. {"title": "mytitle"}
432- k: (Optional) The number of top-ranking similar documents to return per
433- embedding. Defaults to 5.
446+ filter: Dictionary specifying filtering criteria for the documents.
447+
448+ e.g. `{"title": "mytitle"}`
449+ k: The number of top-ranking similar documents to return per embedding.
450+
434451 Returns:
435452 Return docs most similar to embedding vector.
436453 """
@@ -444,11 +461,12 @@ def similarity_search(
444461 """Search for top `k` docs most similar to input query.
445462
446463 Args:
447- query: search query to search documents with.
448- filter: (Optional) A dictionary specifying filtering criteria for the
449- documents. Ie. {"title": "mytitle"}
450- k: (Optional) The number of top-ranking similar documents to return per
451- embedding. Defaults to 5.
464+ query: Search query to search documents with.
465+ filter: Dictionary specifying filtering criteria for the documents.
466+
467+ e.g. `{"title": "mytitle"}`
468+ k: The number of top-ranking similar documents to return per embedding.
469+
452470 Returns:
453471 Return docs most similar to input query.
454472 """
@@ -468,11 +486,12 @@ def similarity_search_with_score(
468486 scores.
469487
470488 Args:
471- query: search query to search documents with.
472- filter: (Optional) A dictionary specifying filtering criteria for the
473- documents. Ie. {"title": "mytitle"}
474- k: (Optional) The number of top-ranking similar documents to return per
475- embedding. Defaults to 5.
489+ query: Search query to search documents with.
490+ filter: Dictionary specifying filtering criteria for the documents.
491+
492+ e.g. `{"title": "mytitle"}`
493+ k: The number of top-ranking similar documents to return per embedding.
494+
476495 Returns:
477496 Return docs most similar to input query along with scores.
478497 """
@@ -506,20 +525,23 @@ def max_marginal_relevance_search(
506525
507526 Args:
508527 **kwargs:
509- query: search query text.
528+ query: Search query text.
510529 filter: Filter on metadata properties, e.g.
511- {
512- "str_property": "foo",
513- "int_property": 123
514- }
515- k: Number of Documents to return. Defaults to 5.
530+
531+ ```json
532+ {
533+ "str_property": "foo",
534+ "int_property": 123
535+ }
536+ ```
537+ k: Number of documents to return.
516538 fetch_k: Number of `Document` objects to fetch to pass to MMR algorithm.
517- lambda_mult: Number between `0` and `1` that determines the degree
518- of diversity among the results with 0 corresponding
519- to maximum diversity and `1` to minimum diversity.
520- Defaults to 0.5.
539+ lambda_mult: Number between `0` and `1` that determines the degree of
540+ diversity among the results with 0 corresponding to maximum diversity
541+ and `1` to minimum diversity.
542+
521543 Returns:
522- List of Documents selected by maximal marginal relevance.
544+ List of documents selected by maximal marginal relevance.
523545 """
524546 embedding = self .embedding .embed_query (query )
525547 return self .max_marginal_relevance_search_by_vector (
@@ -542,15 +564,19 @@ def max_marginal_relevance_search_by_vector(
542564 Args:
543565 embedding: Embedding to look up documents similar to.
544566 filter: Filter on metadata properties, e.g.
545- {
546- "str_property": "foo",
547- "int_property": 123
548- }
549- k: Number of Documents to return.
567+
568+ ```json
569+ {
570+ "str_property": "foo",
571+ "int_property": 123
572+ }
573+ ```
574+ k: Number of documents to return.
550575 fetch_k: Number of `Document` objects to fetch to pass to MMR algorithm.
551576 lambda_mult: Number between `0` and `1` that determines the degree
552577 of diversity among the results with 0 corresponding
553578 to maximum diversity and `1` to minimum diversity.
579+
554580 Returns:
555581 List of Documents selected by maximal marginal relevance.
556582 """
0 commit comments