Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 42 additions & 41 deletions libs/astradb/langchain_astradb/vectorstores.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,20 +383,20 @@ def __init__(
Args:
embedding: the embeddings function or service to use.
This enables client-side embedding functions or calls to external
embedding providers. If `embedding` is provided, arguments
`collection_vector_service_options` and
`collection_embedding_api_key` cannot be provided.
embedding providers. If ``embedding`` is provided, arguments
``collection_vector_service_options`` and
``collection_embedding_api_key`` cannot be provided.
collection_name: name of the Astra DB collection to create/use.
token: API token for Astra DB usage, either in the form of a string
or a subclass of `astrapy.authentication.TokenProvider`.
or a subclass of ``astrapy.authentication.TokenProvider``.
If not provided, the environment variable
ASTRA_DB_APPLICATION_TOKEN is inspected.
api_endpoint: full URL to the API endpoint, such as
`https://<DB-ID>-us-east1.apps.astra.datastax.com`. If not provided,
``https://<DB-ID>-us-east1.apps.astra.datastax.com``. If not provided,
the environment variable ASTRA_DB_API_ENDPOINT is inspected.
environment: a string specifying the environment of the target Data API.
If omitted, defaults to "prod" (Astra DB production).
Other values are in `astrapy.constants.Environment` enum class.
Other values are in ``astrapy.constants.Environment`` enum class.
astra_db_client:
*DEPRECATED starting from version 0.3.5.*
*Please use 'token', 'api_endpoint' and optionally 'environment'.*
Expand Down Expand Up @@ -436,18 +436,18 @@ def __init__(
(see docs.datastax.com/en/astra/astra-db-vector/api-reference/
data-api-commands.html#advanced-feature-indexing-clause-on-createcollection)
collection_vector_service_options: specifies the use of server-side
embeddings within Astra DB. If passing this parameter, `embedding`
embeddings within Astra DB. If passing this parameter, ``embedding``
cannot be provided.
collection_embedding_api_key: for usage of server-side embeddings
within Astra DB. With this parameter one can supply an API Key
that will be passed to Astra DB with each data request.
This parameter can be either a string or a subclass of
`astrapy.authentication.EmbeddingHeadersProvider`.
``astrapy.authentication.EmbeddingHeadersProvider``.
This is useful when the service is configured for the collection,
but no corresponding secret is stored within
Astra's key management system.
This parameter cannot be provided without
specifying `collection_vector_service_options`.
specifying ``collection_vector_service_options``.
content_field: name of the field containing the textual content
in the documents when saved on Astra DB. For vectorize collections,
this cannot be specified; for non-vectorize collection, defaults
Expand All @@ -457,36 +457,36 @@ def __init__(
guessed by inspection of a few documents from the collection, under the
assumption that the longer strings are the most likely candidates.
Please understand the limitations of this method and get some
understanding of your data before passing `"*"` for this parameter.
understanding of your data before passing ``"*"`` for this parameter.
ignore_invalid_documents: if False (default), exceptions are raised
when a document is found on the Astra DB collection that does
not have the expected shape. If set to True, such results
from the database are ignored and a warning is issued. Note
that in this case a similarity search may end up returning fewer
results than the required `k`.
results than the required ``k``.
autodetect_collection: if True, turns on autodetect behavior.
The store will look for an existing collection of the provided name
and infer the store settings from it. Default is False.
In autodetect mode, `content_field` can be given as "*", meaning
that an attempt will be made to determine it by inspection
(unless vectorize is enabled, in which case `content_field` is ignored).
In autodetect mode, ``content_field`` can be given as ``"*"``, meaning
that an attempt will be made to determine it by inspection (unless
vectorize is enabled, in which case ``content_field`` is ignored).
In autodetect mode, the store not only determines whether embeddings
are client- or server-side, but - most importantly - switches
automatically between "nested" and "flat" representations of documents
on DB (i.e. having the metadata key-value pairs grouped in a `metadata`
field or spread at the documents' top-level). The former scheme
is the native mode of the AstraDBVectorStore; the store resorts
on DB (i.e. having the metadata key-value pairs grouped in a
``metadata`` field or spread at the documents' top-level). The former
scheme is the native mode of the AstraDBVectorStore; the store resorts
to the latter in case of vector collections populated with external
means (such as a third-party data import tool) before applying
an AstraDBVectorStore to them.
Note that the following parameters cannot be used if this is True:
`metric`, `setup_mode`, `metadata_indexing_include`,
`metadata_indexing_exclude`, `collection_indexing_policy`,
`collection_vector_service_options`.
``metric``, ``setup_mode``, ``metadata_indexing_include``,
``metadata_indexing_exclude``, ``collection_indexing_policy``,
``collection_vector_service_options``.

Note:
For concurrency in synchronous :meth:`~add_texts`:, as a rule of thumb, on a
typical client machine it is suggested to keep the quantity
For concurrency in synchronous :meth:`~add_texts`, as a rule of thumb,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here it should remain single tick 😉

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ouch! ahahah. Fixing again :)

on a typical client machine it is suggested to keep the quantity
bulk_insert_batch_concurrency * bulk_insert_overwrite_concurrency
much below 1000 to avoid exhausting the client multithreading/networking
resources. The hardcoded defaults are somewhat conservative to meet
Expand All @@ -499,7 +499,7 @@ def __init__(
depending on both the machine/network specs and the expected workload
(specifically, how often a write is an update of an existing id).
Remember you can pass concurrency settings to individual calls to
:meth:`~add_texts` and :meth:`~add_documents` as well.
:meth:`~add_texts` and :meth:`~add_documents` as well.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keep single ticks

"""
# general collection settings
self.collection_name = collection_name
Expand Down Expand Up @@ -820,7 +820,7 @@ def delete_collection(self) -> None:
"""Completely delete the collection from the database.

Completely delete the collection from the database (as opposed
to :meth:`~clear`, which empties it only).
to :meth:`~clear`, which empties it only).
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

single tick

Stored data is lost and unrecoverable, resources are freed.
Use with caution.
"""
Expand All @@ -831,7 +831,7 @@ async def adelete_collection(self) -> None:
"""Completely delete the collection from the database.

Completely delete the collection from the database (as opposed
to :meth:`~aclear`, which empties it only).
to :meth:`~aclear`, which empties it only).
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

single tick

Stored data is lost and unrecoverable, resources are freed.
Use with caution.
"""
Expand Down Expand Up @@ -933,7 +933,7 @@ def add_texts(
Note:
There are constraints on the allowed field names
in the metadata dictionaries, coming from the underlying Astra DB API.
For instance, the `$` (dollar sign) cannot be used in the dict keys.
For instance, the ``$`` (dollar sign) cannot be used in the dict keys.
See this document for details:
https://docs.datastax.com/en/astra/astra-db-vector/api-reference/data-api.html

Expand Down Expand Up @@ -1055,7 +1055,7 @@ async def aadd_texts(
Note:
There are constraints on the allowed field names
in the metadata dictionaries, coming from the underlying Astra DB API.
For instance, the `$` (dollar sign) cannot be used in the dict keys.
For instance, the ``$`` (dollar sign) cannot be used in the dict keys.
See this document for details:
https://docs.datastax.com/en/astra/astra-db-vector/api-reference/data-api.html

Expand Down Expand Up @@ -1833,12 +1833,13 @@ def from_texts(
metadatas: metadata dicts for the texts.
ids: ids to associate to the texts.
**kwargs: you can pass any argument that you would
to :meth:`~add_texts` and/or to the 'AstraDBVectorStore' constructor
(see these methods for details). These arguments will be
to :meth:`~add_texts` and/or to the
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

single tick

``AstraDBVectorStore`` constructor (see these methods for
details). These arguments will be
routed to the respective methods as they are.

Returns:
an `AstraDBVectorStore` vectorstore.
an ``AstraDBVectorStore`` vectorstore.
"""
_add_texts_inspection = inspect.getfullargspec(AstraDBVectorStore.add_texts)
_method_args = (
Expand Down Expand Up @@ -1877,12 +1878,12 @@ async def afrom_texts(
metadatas: metadata dicts for the texts.
ids: ids to associate to the texts.
**kwargs: you can pass any argument that you would
to :meth:`~aadd_texts` and/or to the 'AstraDBVectorStore' constructor
(see these methods for details). These arguments will be
routed to the respective methods as they are.
to :meth:`~aadd_texts` and/or to the ``AstraDBVectorStore``
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Single tick for aadd_texts

constructor (see these methods for details). These arguments
will be routed to the respective methods as they are.

Returns:
an `AstraDBVectorStore` vectorstore.
an ``AstraDBVectorStore`` vectorstore.
"""
_aadd_texts_inspection = inspect.getfullargspec(AstraDBVectorStore.aadd_texts)
_method_args = (
Expand Down Expand Up @@ -1913,13 +1914,13 @@ def from_documents(
) -> AstraDBVectorStore:
"""Create an Astra DB vectorstore from a document list.

Utility method that defers to 'from_texts' (see that one).
Utility method that defers to :meth:`~from_texts` (see that one).
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: could use :meth: here for better reference.


Args: see 'from_texts', except here you have to supply 'documents'
in place of 'texts' and 'metadatas'.
Args: see ``from_texts``, except here you have to supply ``documents``
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not related to this PR: incorrect syntax for Args

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doing this while I'm at it ...

in place of ``texts`` and ``metadatas``.

Returns:
an `AstraDBVectorStore` vectorstore.
an ``AstraDBVectorStore`` vectorstore.
"""
texts = [d.page_content for d in documents]
metadatas = [d.metadata for d in documents]
Expand All @@ -1941,11 +1942,11 @@ async def afrom_documents(

Utility method that defers to :meth:`~afrom_texts` (see that one).

Args: see 'afrom_texts', except here you have to supply 'documents'
in place of 'texts' and 'metadatas'.
Args: see ``afrom_texts``, except here you have to supply ``documents``
in place of ``texts`` and ``metadatas``.

Returns:
an `AstraDBVectorStore` vectorstore.
an ``AstraDBVectorStore`` vectorstore.
"""
texts = [d.page_content for d in documents]
metadatas = [d.metadata for d in documents]
Expand Down
2 changes: 1 addition & 1 deletion libs/astradb/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-astradb"
version = "0.3.5"
version = "0.4.0"
description = "An integration package connecting Astra DB and LangChain"
authors = []
readme = "README.md"
Expand Down
108 changes: 38 additions & 70 deletions libs/astradb/tests/unit_tests/test_vs_autodetect_inferences.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,79 +78,48 @@
({"$vector": [0], "_id": "a", "$vectorize": "a", "x": 9}, True),
]
DOC_FLATNESS_TEST_IDS = [f"DOC=<{json.dumps(doc)}>" for doc, _ in DOC_FLATNESS_PAIRS]

ff = FLAT_DOCUMENT
df = DEEP_DOCUMENT # noqa: PD901
uf = UNKNOWN_FLATNESS_DOCUMENT
DOCS_FLATNESS_PAIRS = [
([], False),
([uf], False),
([uf, uf], False),
([df], False),
([df, df], False),
([df, uf], False),
([ff], True),
([ff, ff], True),
([ff, uf], True),
([ff, df], ValueError()),
]
DOCS_FLATNESS_TEST_IDS = [
" docs=[] ",
" docs=[u] ",
" docs=[u, u] ",
" docs=[d] ",
" docs=[d, d] ",
" docs=[d, u] ",
" docs=[f] ",
" docs=[f, f] ",
" docs=[f, u] ",
" docs=[f, d] ",
]
DOC_CF_PAIRS = [
(DOCUMENT_WITH_CF_X, "x"),
(DOCUMENT_WITH_CF_Y, "y"),
(DOCUMENT_WITH_UNKNOWN_CF, None),
({"x": "LL", "_id": "a"}, "x"),
({"x": 1234, "_id": "a"}, None),
({"_id": "a"}, None),
DOCS_FLATNESS_TEST_PARAMS = [
pytest.param([], False, id=" docs=[] "),
pytest.param([uf], False, id=" docs=[u] "),
pytest.param([uf, uf], False, id=" docs=[u, u] "),
pytest.param([df], False, id=" docs=[d] "),
pytest.param([df, df], False, id=" docs=[d, d] "),
pytest.param([df, uf], False, id=" docs=[d, u] "),
pytest.param([ff], True, id=" docs=[f] "),
pytest.param([ff, ff], True, id=" docs=[f, f] "),
pytest.param([ff, uf], True, id=" docs=[f, u] "),
pytest.param([ff, df], ValueError(), id=" docs=[f, d] "),
]
DOC_CF_TEST_IDS = [
"cf=x",
"cf=y",
"unknown-cf",
"only-x",
"x-is-number",
"no-fields",

DOC_CF_TEST_PARAMS = [
pytest.param(DOCUMENT_WITH_CF_X, "x", id="cf=x"),
pytest.param(DOCUMENT_WITH_CF_Y, "y", id="cf=y"),
pytest.param(DOCUMENT_WITH_UNKNOWN_CF, None, id="unknown-cf"),
pytest.param({"x": "LL", "_id": "a"}, "x", id="only-x"),
pytest.param({"x": 1234, "_id": "a"}, None, id="x-is-number"),
pytest.param({"_id": "a"}, None, id="no-fields"),
]

xc = DOCUMENT_WITH_CF_X
yc = DOCUMENT_WITH_CF_Y
uc = DOCUMENT_WITH_UNKNOWN_CF
DOCS_CF_TRIPLES = [
([], "q", "q"),
([xc], "q", "q"),
([xc, xc, yc], "q", "q"),
([uc, uc], "q", "q"),
([xc, uc, uc], "q", "q"),
([xc, xc, yc, uc, uc, uc], "q", "q"),
([], "*", ValueError),
([xc], "*", "x"),
([xc, xc, yc], "*", "x"),
([uc, uc], "*", ValueError),
([xc, uc, uc], "*", "x"),
([xc, xc, yc, uc, uc, uc], "*", "x"),
]
DOCS_CF_TEST_IDS = [
"[]",
"[x]",
"[x, x, y]",
"[u, u]",
"[x, u, u]",
"[x, x, y, u, u, u]",
"[]",
"[x]",
"[x, x, y]",
"[u, u]",
"[x, u, u]",
"[x, x, y, u, u, u]",
DOCS_CF_TEST_PARAMS = [
pytest.param([], "q", "q", id=" [],req='q' "),
pytest.param([xc], "q", "q", id=" [x],req='q' "),
pytest.param([xc, xc, yc], "q", "q", id=" [x, x, y],req='q' "),
pytest.param([uc, uc], "q", "q", id=" [u, u],req='q' "),
pytest.param([xc, uc, uc], "q", "q", id=" [x, u, u],req='q' "),
pytest.param([xc, xc, yc, uc, uc, uc], "q", "q", id=" [x, x, y, u, u, u],req='q' "),
pytest.param([], "*", ValueError, id=" [],req='*' "),
pytest.param([xc], "*", "x", id=" [x],req='*' "),
pytest.param([xc, xc, yc], "*", "x", id=" [x, x, y],req='*' "),
pytest.param([uc, uc], "*", ValueError, id=" [u, u],req='*' "),
pytest.param([xc, uc, uc], "*", "x", id=" [x, u, u],req='*' "),
pytest.param([xc, xc, yc, uc, uc, uc], "*", "x", id=" [x, x, y, u, u, u],req='*' "),
]


Expand All @@ -168,8 +137,7 @@ def test_detect_document_flatness(

@pytest.mark.parametrize(
("documents", "expected_flatness"),
DOCS_FLATNESS_PAIRS,
ids=DOCS_FLATNESS_TEST_IDS,
DOCS_FLATNESS_TEST_PARAMS,
)
def test_detect_documents_flatness(
self,
Expand All @@ -184,7 +152,8 @@ def test_detect_documents_flatness(
_detect_documents_flatness(documents)

@pytest.mark.parametrize(
("document", "expected_content_field"), DOC_CF_PAIRS, ids=DOC_CF_TEST_IDS
("document", "expected_content_field"),
DOC_CF_TEST_PARAMS,
)
def test_detect_document_content_field(
self,
Expand All @@ -201,8 +170,7 @@ def test_detect_document_content_field(

@pytest.mark.parametrize(
("documents", "requested_content_field", "expected_content_field"),
DOCS_CF_TRIPLES,
ids=DOCS_CF_TEST_IDS,
DOCS_CF_TEST_PARAMS,
)
def test_detect_documents_content_field(
self,
Expand Down