Add support for MongoDB Atlas $vectorSearch vector search (#11139)

Adds support for the `$vectorSearch` operator for
MongoDBAtlasVectorSearch, which was announced at .Local London
(September 26th, 2023). This change breaks compatibility with
the existing `$search` operator used by the original
integration (https://github.com/langchain-ai/langchain/pull/5338) due to
incompatibilities in the Atlas search implementations.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Noah Stapp 2023-09-28 15:01:03 -07:00 committed by GitHub
parent b599f91e33
commit 2c952de21a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 77 additions and 59 deletions

View File

@ -89,6 +89,18 @@ class MongoDBAtlasVectorSearch(VectorStore):
embedding: Embeddings, embedding: Embeddings,
**kwargs: Any, **kwargs: Any,
) -> MongoDBAtlasVectorSearch: ) -> MongoDBAtlasVectorSearch:
"""Construct a `MongoDB Atlas Vector Search` vector store
from a MongoDB connection URI.
Args:
connection_string: A valid MongoDB connection URI.
namespace: A valid MongoDB namespace (database and collection).
embedding: The text embedding model to use for the vector store.
Returns:
A new MongoDBAtlasVectorSearch instance.
"""
try: try:
from pymongo import MongoClient from pymongo import MongoClient
except ImportError: except ImportError:
@ -149,24 +161,23 @@ class MongoDBAtlasVectorSearch(VectorStore):
self, self,
embedding: List[float], embedding: List[float],
k: int = 4, k: int = 4,
pre_filter: Optional[dict] = None, pre_filter: Optional[Dict] = None,
post_filter_pipeline: Optional[List[Dict]] = None, post_filter_pipeline: Optional[List[Dict]] = None,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
knn_beta = { params = {
"vector": embedding, "queryVector": embedding,
"path": self._embedding_key, "path": self._embedding_key,
"k": k, "numCandidates": k * 10,
"limit": k,
"index": self._index_name,
} }
if pre_filter: if pre_filter:
knn_beta["filter"] = pre_filter params["filter"] = pre_filter
query = {"$vectorSearch": params}
pipeline = [ pipeline = [
{ query,
"$search": { {"$set": {"score": {"$meta": "vectorSearchScore"}}},
"index": self._index_name,
"knnBeta": knn_beta,
}
},
{"$set": {"score": {"$meta": "searchScore"}}},
] ]
if post_filter_pipeline is not None: if post_filter_pipeline is not None:
pipeline.extend(post_filter_pipeline) pipeline.extend(post_filter_pipeline)
@ -183,12 +194,12 @@ class MongoDBAtlasVectorSearch(VectorStore):
query: str, query: str,
*, *,
k: int = 4, k: int = 4,
pre_filter: Optional[dict] = None, pre_filter: Optional[Dict] = None,
post_filter_pipeline: Optional[List[Dict]] = None, post_filter_pipeline: Optional[List[Dict]] = None,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
"""Return MongoDB documents most similar to query, along with scores. """Return MongoDB documents most similar to the given query and their scores.
Use the knnBeta Operator available in MongoDB Atlas Search Uses the knnBeta Operator available in MongoDB Atlas Search.
This feature is in early access and available only for evaluation purposes, to This feature is in early access and available only for evaluation purposes, to
validate functionality, and to gather feedback from a small closed group of validate functionality, and to gather feedback from a small closed group of
early access users. It is not recommended for production deployments as we early access users. It is not recommended for production deployments as we
@ -197,14 +208,14 @@ class MongoDBAtlasVectorSearch(VectorStore):
Args: Args:
query: Text to look up documents similar to. query: Text to look up documents similar to.
k: Optional Number of Documents to return. Defaults to 4. k: (Optional) number of documents to return. Defaults to 4.
pre_filter: Optional Dictionary of argument(s) to prefilter on document pre_filter: (Optional) dictionary of argument(s) to prefilter document
fields. fields on.
post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
following the knnBeta search. following the knnBeta vector search.
Returns: Returns:
List of Documents most similar to the query and score for each List of documents most similar to the query and their scores.
""" """
embedding = self._embedding.embed_query(query) embedding = self._embedding.embed_query(query)
docs = self._similarity_search_with_score( docs = self._similarity_search_with_score(
@ -219,29 +230,29 @@ class MongoDBAtlasVectorSearch(VectorStore):
self, self,
query: str, query: str,
k: int = 4, k: int = 4,
pre_filter: Optional[dict] = None, pre_filter: Optional[Dict] = None,
post_filter_pipeline: Optional[List[Dict]] = None, post_filter_pipeline: Optional[List[Dict]] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Return MongoDB documents most similar to query. """Return MongoDB documents most similar to the given query.
Use the knnBeta Operator available in MongoDB Atlas Search Uses the knnBeta Operator available in MongoDB Atlas Search.
This feature is in early access and available only for evaluation purposes, to This feature is in early access and available only for evaluation purposes, to
validate functionality, and to gather feedback from a small closed group of validate functionality, and to gather feedback from a small closed group of
early access users. It is not recommended for production deployments as we may early access users. It is not recommended for production deployments as we
introduce breaking changes. may introduce breaking changes.
For more: https://www.mongodb.com/docs/atlas/atlas-search/knn-beta For more: https://www.mongodb.com/docs/atlas/atlas-search/knn-beta
Args: Args:
query: Text to look up documents similar to. query: Text to look up documents similar to.
k: Optional Number of Documents to return. Defaults to 4. k: (Optional) number of documents to return. Defaults to 4.
pre_filter: Optional Dictionary of argument(s) to prefilter on document pre_filter: (Optional) dictionary of argument(s) to prefilter document
fields. fields on.
post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
following the knnBeta search. following the knnBeta vector search.
Returns: Returns:
List of Documents most similar to the query and score for each List of documents most similar to the query and their scores.
""" """
docs_and_scores = self.similarity_search_with_score( docs_and_scores = self.similarity_search_with_score(
query, query,
@ -257,30 +268,30 @@ class MongoDBAtlasVectorSearch(VectorStore):
k: int = 4, k: int = 4,
fetch_k: int = 20, fetch_k: int = 20,
lambda_mult: float = 0.5, lambda_mult: float = 0.5,
pre_filter: Optional[dict] = None, pre_filter: Optional[Dict] = None,
post_filter_pipeline: Optional[List[Dict]] = None, post_filter_pipeline: Optional[List[Dict]] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Return docs selected using the maximal marginal relevance. """Return documents selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents. among selected documents.
Args: Args:
query: Text to look up documents similar to. query: Text to look up documents similar to.
k: Optional Number of Documents to return. Defaults to 4. k: (Optional) number of documents to return. Defaults to 4.
fetch_k: Optional Number of Documents to fetch before passing to MMR fetch_k: (Optional) number of documents to fetch before passing to MMR
algorithm. Defaults to 20. algorithm. Defaults to 20.
lambda_mult: Number between 0 and 1 that determines the degree lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity. to maximum diversity and 1 to minimum diversity.
Defaults to 0.5. Defaults to 0.5.
pre_filter: Optional Dictionary of argument(s) to prefilter on document pre_filter: (Optional) dictionary of argument(s) to prefilter on document
fields. fields.
post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages post_filter_pipeline: (Optional) pipeline of MongoDB aggregation stages
following the knnBeta search. following the knnBeta vector search.
Returns: Returns:
List of Documents selected by maximal marginal relevance. List of documents selected by maximal marginal relevance.
""" """
query_embedding = self._embedding.embed_query(query) query_embedding = self._embedding.embed_query(query)
docs = self._similarity_search_with_score( docs = self._similarity_search_with_score(
@ -303,11 +314,11 @@ class MongoDBAtlasVectorSearch(VectorStore):
cls, cls,
texts: List[str], texts: List[str],
embedding: Embeddings, embedding: Embeddings,
metadatas: Optional[List[dict]] = None, metadatas: Optional[List[Dict]] = None,
collection: Optional[Collection[MongoDBDocumentType]] = None, collection: Optional[Collection[MongoDBDocumentType]] = None,
**kwargs: Any, **kwargs: Any,
) -> MongoDBAtlasVectorSearch: ) -> MongoDBAtlasVectorSearch:
"""Construct MongoDBAtlasVectorSearch wrapper from raw documents. """Construct a `MongoDB Atlas Vector Search` vector store from raw documents.
This is a user-friendly interface that: This is a user-friendly interface that:
1. Embeds documents. 1. Embeds documents.

View File

@ -3,7 +3,7 @@ from __future__ import annotations
import os import os
from time import sleep from time import sleep
from typing import TYPE_CHECKING, Any from typing import Any
import pytest import pytest
@ -11,41 +11,46 @@ from langchain.docstore.document import Document
from langchain.schema.embeddings import Embeddings from langchain.schema.embeddings import Embeddings
from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch
if TYPE_CHECKING:
from pymongo import MongoClient
INDEX_NAME = "langchain-test-index" INDEX_NAME = "langchain-test-index"
NAMESPACE = "langchain_test_db.langchain_test_collection" NAMESPACE = "langchain_test_db.langchain_test_collection"
CONNECTION_STRING = os.environ.get("MONGODB_ATLAS_URI") CONNECTION_STRING = os.environ.get("MONGODB_ATLAS_URI")
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".") DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
# Instantiate as constant instead of pytest fixture to prevent needing to make multiple
# connections.
def get_collection() -> Any:
from pymongo import MongoClient
@pytest.fixture test_client: MongoClient = MongoClient(CONNECTION_STRING)
def collection() -> Any:
test_client = MongoClient(CONNECTION_STRING)
return test_client[DB_NAME][COLLECTION_NAME] return test_client[DB_NAME][COLLECTION_NAME]
@pytest.fixture()
def collection() -> Any:
return get_collection()
class TestMongoDBAtlasVectorSearch: class TestMongoDBAtlasVectorSearch:
@classmethod @classmethod
def setup_class(cls, collection: Any) -> None: def setup_class(cls) -> None:
# insure the test collection is empty # insure the test collection is empty
collection = get_collection()
assert collection.count_documents({}) == 0 # type: ignore[index] # noqa: E501 assert collection.count_documents({}) == 0 # type: ignore[index] # noqa: E501
@classmethod @classmethod
def teardown_class(cls, collection: Any) -> None: def teardown_class(cls) -> None:
collection = get_collection()
# delete all the documents in the collection # delete all the documents in the collection
collection.delete_many({}) # type: ignore[index] collection.delete_many({}) # type: ignore[index]
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def setup(self, collection: Any) -> None: def setup(self) -> None:
collection = get_collection()
# delete all the documents in the collection # delete all the documents in the collection
collection.delete_many({}) # type: ignore[index] collection.delete_many({}) # type: ignore[index]
def test_from_documents(self, embedding_openai: Embeddings) -> None: def test_from_documents(
self, embedding_openai: Embeddings, collection: Any
) -> None:
"""Test end to end construction and search.""" """Test end to end construction and search."""
documents = [ documents = [
Document(page_content="Dogs are tough.", metadata={"a": 1}), Document(page_content="Dogs are tough.", metadata={"a": 1}),
@ -64,7 +69,7 @@ class TestMongoDBAtlasVectorSearch:
assert output[0].page_content == "What is a sandwich?" assert output[0].page_content == "What is a sandwich?"
assert output[0].metadata["c"] == 1 assert output[0].metadata["c"] == 1
def test_from_texts(self, embedding_openai: Embeddings) -> None: def test_from_texts(self, embedding_openai: Embeddings, collection: Any) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
"Cats have fluff.", "Cats have fluff.",
@ -81,7 +86,9 @@ class TestMongoDBAtlasVectorSearch:
output = vectorstore.similarity_search("Sandwich", k=1) output = vectorstore.similarity_search("Sandwich", k=1)
assert output[0].page_content == "What is a sandwich?" assert output[0].page_content == "What is a sandwich?"
def test_from_texts_with_metadatas(self, embedding_openai: Embeddings) -> None: def test_from_texts_with_metadatas(
self, embedding_openai: Embeddings, collection: Any
) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
"Cats have fluff.", "Cats have fluff.",
@ -102,7 +109,7 @@ class TestMongoDBAtlasVectorSearch:
assert output[0].metadata["c"] == 1 assert output[0].metadata["c"] == 1
def test_from_texts_with_metadatas_and_pre_filter( def test_from_texts_with_metadatas_and_pre_filter(
self, embedding_openai: Embeddings self, embedding_openai: Embeddings, collection: Any
) -> None: ) -> None:
texts = [ texts = [
"Dogs are tough.", "Dogs are tough.",
@ -124,7 +131,7 @@ class TestMongoDBAtlasVectorSearch:
) )
assert output == [] assert output == []
def test_mmr(self, embedding_openai: Embeddings) -> None: def test_mmr(self, embedding_openai: Embeddings, collection: Any) -> None:
texts = ["foo", "foo", "fou", "foy"] texts = ["foo", "foo", "fou", "foy"]
vectorstore = MongoDBAtlasVectorSearch.from_texts( vectorstore = MongoDBAtlasVectorSearch.from_texts(
texts, texts,