Add support for MongoDB Atlas $vectorSearch vector search (#11139)

Adds support for the `$vectorSearch` operator for MongoDBAtlasVectorSearch, which was announced at .Local London (September 26th, 2023). This change breaks compatibility with the existing `$search` operator used by the original integration (https://github.com/langchain-ai/langchain/pull/5338) due to incompatibilities in the Atlas search implementations.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
11 months ago · 2c952de21a
parent b599f91e33
commit 2c952de21a
2 changed files with 77 additions and 59 deletions
--- a/libs/langchain/langchain/vectorstores/mongodb_atlas.py
+++ b/libs/langchain/langchain/vectorstores/mongodb_atlas.py
@ -89,6 +89,18 @@ class MongoDBAtlasVectorSearch(VectorStore):
        embedding: Embeddings,
        **kwargs: Any,
    ) -> MongoDBAtlasVectorSearch:
+        """Construct a `MongoDB Atlas Vector Search` vector store
+        from a MongoDB connection URI.
+
+        Args:
+            connection_string: A valid MongoDB connection URI.
+            namespace: A valid MongoDB namespace (database and collection).
+            embedding: The text embedding model to use for the vector store.
+
+        Returns:
+            A new MongoDBAtlasVectorSearch instance.
+
+        """
        try:
            from pymongo import MongoClient
        except ImportError:
@ -149,24 +161,23 @@ class MongoDBAtlasVectorSearch(VectorStore):
        self,
        embedding: List[float],
        k: int = 4,
-        pre_filter: Optional[dict] = None,
+        pre_filter: Optional[Dict] = None,
        post_filter_pipeline: Optional[List[Dict]] = None,
    ) -> List[Tuple[Document, float]]:
-        knn_beta = {
-            "vector": embedding,
+        params = {
+            "queryVector": embedding,
            "path": self._embedding_key,
-            "k": k,
+            "numCandidates": k * 10,
+            "limit": k,
+            "index": self._index_name,
        }
        if pre_filter:
-            knn_beta["filter"] = pre_filter
+            params["filter"] = pre_filter
+        query = {"$vectorSearch": params}
+
        pipeline = [
-            {
-                "$search": {
-                    "index": self._index_name,
-                    "knnBeta": knn_beta,
-                }
-            },
-            {"$set": {"score": {"$meta": "searchScore"}}},
+            query,
+            {"$set": {"score": {"$meta": "vectorSearchScore"}}},
        ]
        if post_filter_pipeline is not None:
            pipeline.extend(post_filter_pipeline)
@ -183,12 +194,12 @@ class MongoDBAtlasVectorSearch(VectorStore):
        query: str,
        *,
        k: int = 4,
-        pre_filter: Optional[dict] = None,
+        pre_filter: Optional[Dict] = None,
        post_filter_pipeline: Optional[List[Dict]] = None,
    ) -> List[Tuple[Document, float]]:
-        """Return MongoDB documents most similar to query, along with scores.
+        """Return MongoDB documents most similar to the given query and their scores.

-        Use the knnBeta Operator available in MongoDB Atlas Search
+        Uses the knnBeta Operator available in MongoDB Atlas Search.
        This feature is in early access and available only for evaluation purposes, to
        validate functionality, and to gather feedback from a small closed group of
        early access users. It is not recommended for production deployments as we
@ -197,14 +208,14 @@ class MongoDBAtlasVectorSearch(VectorStore):

        Args:
            query: Text to look up documents similar to.
-            k: Optional Number of Documents to return. Defaults to 4.
-            pre_filter: Optional Dictionary of argument(s) to prefilter on document
-                fields.
-            post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
-                following the knnBeta search.
+            k: (Optional) number of documents to return. Defaults to 4.
+            pre_filter: (Optional) dictionary of argument(s) to prefilter document
+                fields on.
+            post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
+                following the knnBeta vector search.

        Returns:
-            List of Documents most similar to the query and score for each
+            List of documents most similar to the query and their scores.
        """
        embedding = self._embedding.embed_query(query)
        docs = self._similarity_search_with_score(
@ -219,29 +230,29 @@ class MongoDBAtlasVectorSearch(VectorStore):
        self,
        query: str,
        k: int = 4,
-        pre_filter: Optional[dict] = None,
+        pre_filter: Optional[Dict] = None,
        post_filter_pipeline: Optional[List[Dict]] = None,
        **kwargs: Any,
    ) -> List[Document]:
-        """Return MongoDB documents most similar to query.
+        """Return MongoDB documents most similar to the given query.

-        Use the knnBeta Operator available in MongoDB Atlas Search
+        Uses the knnBeta Operator available in MongoDB Atlas Search.
        This feature is in early access and available only for evaluation purposes, to
        validate functionality, and to gather feedback from a small closed group of
-        early access users. It is not recommended for production deployments as we may
-        introduce breaking changes.
+        early access users. It is not recommended for production deployments as we
+        may introduce breaking changes.
        For more: https://www.mongodb.com/docs/atlas/atlas-search/knn-beta

        Args:
            query: Text to look up documents similar to.
-            k: Optional Number of Documents to return. Defaults to 4.
-            pre_filter: Optional Dictionary of argument(s) to prefilter on document
-                fields.
-            post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
-                following the knnBeta search.
+            k: (Optional) number of documents to return. Defaults to 4.
+            pre_filter: (Optional) dictionary of argument(s) to prefilter document
+                fields on.
+            post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
+                following the knnBeta vector search.

        Returns:
-            List of Documents most similar to the query and score for each
+            List of documents most similar to the query and their scores.
        """
        docs_and_scores = self.similarity_search_with_score(
            query,
@ -257,30 +268,30 @@ class MongoDBAtlasVectorSearch(VectorStore):
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
-        pre_filter: Optional[dict] = None,
+        pre_filter: Optional[Dict] = None,
        post_filter_pipeline: Optional[List[Dict]] = None,
        **kwargs: Any,
    ) -> List[Document]:
-        """Return docs selected using the maximal marginal relevance.
+        """Return documents selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            query: Text to look up documents similar to.
-            k: Optional Number of Documents to return. Defaults to 4.
-            fetch_k: Optional Number of Documents to fetch before passing to MMR
+            k: (Optional) number of documents to return. Defaults to 4.
+            fetch_k: (Optional) number of documents to fetch before passing to MMR
                algorithm. Defaults to 20.
            lambda_mult: Number between 0 and 1 that determines the degree
                        of diversity among the results with 0 corresponding
                        to maximum diversity and 1 to minimum diversity.
                        Defaults to 0.5.
-            pre_filter: Optional Dictionary of argument(s) to prefilter on document
+            pre_filter: (Optional) dictionary of argument(s) to prefilter on document
                fields.
-            post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
-                following the knnBeta search.
+            post_filter_pipeline: (Optional) pipeline of MongoDB aggregation stages
+                following the knnBeta vector search.
        Returns:
-            List of Documents selected by maximal marginal relevance.
+            List of documents selected by maximal marginal relevance.
        """
        query_embedding = self._embedding.embed_query(query)
        docs = self._similarity_search_with_score(
@ -303,11 +314,11 @@ class MongoDBAtlasVectorSearch(VectorStore):
        cls,
        texts: List[str],
        embedding: Embeddings,
-        metadatas: Optional[List[dict]] = None,
+        metadatas: Optional[List[Dict]] = None,
        collection: Optional[Collection[MongoDBDocumentType]] = None,
        **kwargs: Any,
    ) -> MongoDBAtlasVectorSearch:
-        """Construct MongoDBAtlasVectorSearch wrapper from raw documents.
+        """Construct a `MongoDB Atlas Vector Search` vector store from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
--- a/libs/langchain/tests/integration_tests/vectorstores/test_mongodb_atlas.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/test_mongodb_atlas.py
@ -3,7 +3,7 @@ from __future__ import annotations

 import os
 from time import sleep
-from typing import TYPE_CHECKING, Any
+from typing import Any

 import pytest

@ -11,41 +11,46 @@ from langchain.docstore.document import Document
 from langchain.schema.embeddings import Embeddings
 from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch

-if TYPE_CHECKING:
-    from pymongo import MongoClient
-
 INDEX_NAME = "langchain-test-index"
 NAMESPACE = "langchain_test_db.langchain_test_collection"
 CONNECTION_STRING = os.environ.get("MONGODB_ATLAS_URI")
 DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")

-# Instantiate as constant instead of pytest fixture to prevent needing to make multiple
-# connections.

+def get_collection() -> Any:
+    from pymongo import MongoClient

-@pytest.fixture
-def collection() -> Any:
-    test_client = MongoClient(CONNECTION_STRING)
+    test_client: MongoClient = MongoClient(CONNECTION_STRING)
    return test_client[DB_NAME][COLLECTION_NAME]


+@pytest.fixture()
+def collection() -> Any:
+    return get_collection()
+
+
 class TestMongoDBAtlasVectorSearch:
    @classmethod
-    def setup_class(cls, collection: Any) -> None:
+    def setup_class(cls) -> None:
        # insure the test collection is empty
+        collection = get_collection()
        assert collection.count_documents({}) == 0  # type: ignore[index]  # noqa: E501

    @classmethod
-    def teardown_class(cls, collection: Any) -> None:
+    def teardown_class(cls) -> None:
+        collection = get_collection()
        # delete all the documents in the collection
        collection.delete_many({})  # type: ignore[index]

    @pytest.fixture(autouse=True)
-    def setup(self, collection: Any) -> None:
+    def setup(self) -> None:
+        collection = get_collection()
        # delete all the documents in the collection
        collection.delete_many({})  # type: ignore[index]

-    def test_from_documents(self, embedding_openai: Embeddings) -> None:
+    def test_from_documents(
+        self, embedding_openai: Embeddings, collection: Any
+    ) -> None:
        """Test end to end construction and search."""
        documents = [
            Document(page_content="Dogs are tough.", metadata={"a": 1}),
@ -64,7 +69,7 @@ class TestMongoDBAtlasVectorSearch:
        assert output[0].page_content == "What is a sandwich?"
        assert output[0].metadata["c"] == 1

-    def test_from_texts(self, embedding_openai: Embeddings) -> None:
+    def test_from_texts(self, embedding_openai: Embeddings, collection: Any) -> None:
        texts = [
            "Dogs are tough.",
            "Cats have fluff.",
@ -81,7 +86,9 @@ class TestMongoDBAtlasVectorSearch:
        output = vectorstore.similarity_search("Sandwich", k=1)
        assert output[0].page_content == "What is a sandwich?"

-    def test_from_texts_with_metadatas(self, embedding_openai: Embeddings) -> None:
+    def test_from_texts_with_metadatas(
+        self, embedding_openai: Embeddings, collection: Any
+    ) -> None:
        texts = [
            "Dogs are tough.",
            "Cats have fluff.",
@ -102,7 +109,7 @@ class TestMongoDBAtlasVectorSearch:
        assert output[0].metadata["c"] == 1

    def test_from_texts_with_metadatas_and_pre_filter(
-        self, embedding_openai: Embeddings
+        self, embedding_openai: Embeddings, collection: Any
    ) -> None:
        texts = [
            "Dogs are tough.",
@ -124,7 +131,7 @@ class TestMongoDBAtlasVectorSearch:
        )
        assert output == []

-    def test_mmr(self, embedding_openai: Embeddings) -> None:
+    def test_mmr(self, embedding_openai: Embeddings, collection: Any) -> None:
        texts = ["foo", "foo", "fou", "foy"]
        vectorstore = MongoDBAtlasVectorSearch.from_texts(
            texts,