|
87 | 87 | ERROR_LEXICAL_QUERY_ON_NONHYBRID_SEARCH = ( |
88 | 88 | "Parameter 'lexical_query' cannot be passed for a non-hybrid search" |
89 | 89 | ) |
# Text of the warning emitted by the score-returning similarity-search
# methods when the store is configured for hybrid search: the scores come
# from the reranking step, so callers should not threshold on them.
WARNING_HYBRID_SEARCH_WITH_SCORES = (
    "Scores returned as part of a hybrid search, "
    "which come from the reranking step, "
    "may not be deterministically computed solely based on the query and result. "
    "Using the scores e.g. for threshold-filtering "
    "may lead to unpredictable results and is discouraged."
)
90 | 97 |
|
91 | 98 | logger = logging.getLogger(__name__) |
92 | 99 |
|
@@ -2459,7 +2466,7 @@ def similarity_search( |
2459 | 2466 | """ |
2460 | 2467 | return [ |
2461 | 2468 | doc |
2462 | | - for (doc, _, _) in self.similarity_search_with_score_id( |
| 2469 | + for (doc, _, _) in self._similarity_search_with_score_id_impl( |
2463 | 2470 | query=query, |
2464 | 2471 | k=k, |
2465 | 2472 | filter=filter, |
@@ -2488,24 +2495,26 @@ def similarity_search_with_score( |
2488 | 2495 | Returns: |
2489 | 2496 | The list of (Document, score), the most similar to the query vector. |
2490 | 2497 | """ |
| 2498 | + if self.hybrid_search: |
| 2499 | + warnings.warn(WARNING_HYBRID_SEARCH_WITH_SCORES, stacklevel=2) |
2491 | 2500 | return [ |
2492 | 2501 | (doc, score) |
2493 | | - for (doc, score, _) in self.similarity_search_with_score_id( |
| 2502 | + for (doc, score, _) in self._similarity_search_with_score_id_impl( |
2494 | 2503 | query=query, |
2495 | 2504 | k=k, |
2496 | 2505 | filter=filter, |
2497 | 2506 | lexical_query=lexical_query, |
2498 | 2507 | ) |
2499 | 2508 | ] |
2500 | 2509 |
|
2501 | | - def similarity_search_with_score_id( |
| 2510 | + def _similarity_search_with_score_id_impl( |
2502 | 2511 | self, |
2503 | 2512 | query: str, |
2504 | 2513 | k: int = 4, |
2505 | 2514 | filter: dict[str, Any] | None = None, # noqa: A002 |
2506 | 2515 | lexical_query: str | None = None, |
2507 | 2516 | ) -> list[tuple[Document, float, str]]: |
2508 | | - """Return docs most similar to the query with score and id. |
| 2517 | + """Implementation for similarity_search_with_score_id. |
2509 | 2518 |
|
2510 | 2519 | Args: |
2511 | 2520 | query: Query to look up documents similar to. |
@@ -2561,6 +2570,35 @@ def similarity_search_with_score_id( |
2561 | 2570 | filter_dict=filter, |
2562 | 2571 | ) |
2563 | 2572 |
|
def similarity_search_with_score_id(
    self,
    query: str,
    k: int = 4,
    filter: dict[str, Any] | None = None,  # noqa: A002
    lexical_query: str | None = None,
) -> list[tuple[Document, float, str]]:
    """Find the documents closest to a query, with their score and id.

    Public entry point: when the store is configured for hybrid search a
    warning about the reliability of the scores is issued first, then the
    actual lookup is delegated to ``_similarity_search_with_score_id_impl``.

    Args:
        query: Query to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter: Filter on the metadata to apply.
        lexical_query: for hybrid search, a specific query for the lexical
            portion of the retrieval. If omitted or empty, defaults to the same
            as 'query'. If passed on a non-hybrid search, an error is raised.

    Returns:
        The list of (Document, score, id), the most similar to the query.
    """
    # stacklevel=2 attributes the warning to whoever called this method,
    # not to this wrapper.
    if self.hybrid_search:
        warnings.warn(WARNING_HYBRID_SEARCH_WITH_SCORES, stacklevel=2)

    scored_hits = self._similarity_search_with_score_id_impl(
        query=query,
        k=k,
        filter=filter,
        lexical_query=lexical_query,
    )
    return scored_hits
| 2601 | + |
2564 | 2602 | @override |
2565 | 2603 | def similarity_search_by_vector( |
2566 | 2604 | self, |
@@ -2727,7 +2765,7 @@ async def asimilarity_search( |
2727 | 2765 | """ |
2728 | 2766 | return [ |
2729 | 2767 | doc |
2730 | | - for (doc, _, _) in await self.asimilarity_search_with_score_id( |
| 2768 | + for (doc, _, _) in await self._asimilarity_search_with_score_id_impl( |
2731 | 2769 | query=query, |
2732 | 2770 | k=k, |
2733 | 2771 | filter=filter, |
@@ -2756,24 +2794,26 @@ async def asimilarity_search_with_score( |
2756 | 2794 | Returns: |
2757 | 2795 | The list of (Document, score), the most similar to the query vector. |
2758 | 2796 | """ |
| 2797 | + if self.hybrid_search: |
| 2798 | + warnings.warn(WARNING_HYBRID_SEARCH_WITH_SCORES, stacklevel=2) |
2759 | 2799 | return [ |
2760 | 2800 | (doc, score) |
2761 | | - for (doc, score, _) in await self.asimilarity_search_with_score_id( |
| 2801 | + for (doc, score, _) in await self._asimilarity_search_with_score_id_impl( |
2762 | 2802 | query=query, |
2763 | 2803 | k=k, |
2764 | 2804 | filter=filter, |
2765 | 2805 | lexical_query=lexical_query, |
2766 | 2806 | ) |
2767 | 2807 | ] |
2768 | 2808 |
|
2769 | | - async def asimilarity_search_with_score_id( |
| 2809 | + async def _asimilarity_search_with_score_id_impl( |
2770 | 2810 | self, |
2771 | 2811 | query: str, |
2772 | 2812 | k: int = 4, |
2773 | 2813 | filter: dict[str, Any] | None = None, # noqa: A002 |
2774 | 2814 | lexical_query: str | None = None, |
2775 | 2815 | ) -> list[tuple[Document, float, str]]: |
2776 | | - """Return docs most similar to the query with score and id. |
| 2816 | + """Implementation for asimilarity_search_with_score_id. |
2777 | 2817 |
|
2778 | 2818 | Args: |
2779 | 2819 | query: Query to look up documents similar to. |
@@ -2829,6 +2869,35 @@ async def asimilarity_search_with_score_id( |
2829 | 2869 | filter_dict=filter, |
2830 | 2870 | ) |
2831 | 2871 |
|
async def asimilarity_search_with_score_id(
    self,
    query: str,
    k: int = 4,
    filter: dict[str, Any] | None = None,  # noqa: A002
    lexical_query: str | None = None,
) -> list[tuple[Document, float, str]]:
    """Asynchronously find the documents closest to a query, with score and id.

    Public entry point: when the store is configured for hybrid search a
    warning about the reliability of the scores is issued first, then the
    actual lookup is awaited from ``_asimilarity_search_with_score_id_impl``.

    Args:
        query: Query to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter: Filter on the metadata to apply.
        lexical_query: for hybrid search, a specific query for the lexical
            portion of the retrieval. If omitted or empty, defaults to the same
            as 'query'. If passed on a non-hybrid search, an error is raised.

    Returns:
        The list of (Document, score, id), the most similar to the query.
    """
    # The warning is raised here rather than in the implementation, so
    # callers of the implementation method do not trigger it.
    if self.hybrid_search:
        warnings.warn(WARNING_HYBRID_SEARCH_WITH_SCORES, stacklevel=2)

    scored_hits = await self._asimilarity_search_with_score_id_impl(
        query=query,
        k=k,
        filter=filter,
        lexical_query=lexical_query,
    )
    return scored_hits
| 2900 | + |
2832 | 2901 | @override |
2833 | 2902 | async def asimilarity_search_by_vector( |
2834 | 2903 | self, |
|
0 commit comments