Add support for MongoDB Atlas $vectorSearch vector search (#11139)

Adds support for the `$vectorSearch` operator for
MongoDBAtlasVectorSearch, which was announced at .Local London
(September 26th, 2023). This change breaks compatibility
with the existing `$search` operator used by the original
integration (https://github.com/langchain-ai/langchain/pull/5338) due to
incompatibilities in the Atlas search implementations.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
pull/10601/head
Noah Stapp 11 months ago committed by GitHub
parent b599f91e33
commit 2c952de21a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -89,6 +89,18 @@ class MongoDBAtlasVectorSearch(VectorStore):
embedding: Embeddings,
**kwargs: Any,
) -> MongoDBAtlasVectorSearch:
"""Construct a `MongoDB Atlas Vector Search` vector store
from a MongoDB connection URI.
Args:
connection_string: A valid MongoDB connection URI.
namespace: A valid MongoDB namespace (database and collection).
embedding: The text embedding model to use for the vector store.
Returns:
A new MongoDBAtlasVectorSearch instance.
"""
try:
from pymongo import MongoClient
except ImportError:
@ -149,24 +161,23 @@ class MongoDBAtlasVectorSearch(VectorStore):
self,
embedding: List[float],
k: int = 4,
pre_filter: Optional[dict] = None,
pre_filter: Optional[Dict] = None,
post_filter_pipeline: Optional[List[Dict]] = None,
) -> List[Tuple[Document, float]]:
knn_beta = {
"vector": embedding,
params = {
"queryVector": embedding,
"path": self._embedding_key,
"k": k,
"numCandidates": k * 10,
"limit": k,
"index": self._index_name,
}
if pre_filter:
knn_beta["filter"] = pre_filter
params["filter"] = pre_filter
query = {"$vectorSearch": params}
pipeline = [
{
"$search": {
"index": self._index_name,
"knnBeta": knn_beta,
}
},
{"$set": {"score": {"$meta": "searchScore"}}},
query,
{"$set": {"score": {"$meta": "vectorSearchScore"}}},
]
if post_filter_pipeline is not None:
pipeline.extend(post_filter_pipeline)
@ -183,12 +194,12 @@ class MongoDBAtlasVectorSearch(VectorStore):
query: str,
*,
k: int = 4,
pre_filter: Optional[dict] = None,
pre_filter: Optional[Dict] = None,
post_filter_pipeline: Optional[List[Dict]] = None,
) -> List[Tuple[Document, float]]:
"""Return MongoDB documents most similar to query, along with scores.
"""Return MongoDB documents most similar to the given query and their scores.
Use the knnBeta Operator available in MongoDB Atlas Search
Uses the knnBeta Operator available in MongoDB Atlas Search.
This feature is in early access and available only for evaluation purposes, to
validate functionality, and to gather feedback from a small closed group of
early access users. It is not recommended for production deployments as we
@ -197,14 +208,14 @@ class MongoDBAtlasVectorSearch(VectorStore):
Args:
query: Text to look up documents similar to.
k: Optional Number of Documents to return. Defaults to 4.
pre_filter: Optional Dictionary of argument(s) to prefilter on document
fields.
post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
following the knnBeta search.
k: (Optional) number of documents to return. Defaults to 4.
pre_filter: (Optional) dictionary of argument(s) to prefilter document
fields on.
post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
following the knnBeta vector search.
Returns:
List of Documents most similar to the query and score for each
List of documents most similar to the query and their scores.
"""
embedding = self._embedding.embed_query(query)
docs = self._similarity_search_with_score(
@ -219,29 +230,29 @@ class MongoDBAtlasVectorSearch(VectorStore):
self,
query: str,
k: int = 4,
pre_filter: Optional[dict] = None,
pre_filter: Optional[Dict] = None,
post_filter_pipeline: Optional[List[Dict]] = None,
**kwargs: Any,
) -> List[Document]:
"""Return MongoDB documents most similar to query.
"""Return MongoDB documents most similar to the given query.
Use the knnBeta Operator available in MongoDB Atlas Search
Uses the knnBeta Operator available in MongoDB Atlas Search.
This feature is in early access and available only for evaluation purposes, to
validate functionality, and to gather feedback from a small closed group of
early access users. It is not recommended for production deployments as we may
introduce breaking changes.
early access users. It is not recommended for production deployments as we
may introduce breaking changes.
For more: https://www.mongodb.com/docs/atlas/atlas-search/knn-beta
Args:
query: Text to look up documents similar to.
k: Optional Number of Documents to return. Defaults to 4.
pre_filter: Optional Dictionary of argument(s) to prefilter on document
fields.
post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
following the knnBeta search.
k: (Optional) number of documents to return. Defaults to 4.
pre_filter: (Optional) dictionary of argument(s) to prefilter document
fields on.
post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
following the knnBeta vector search.
Returns:
List of Documents most similar to the query and score for each
List of documents most similar to the query and their scores.
"""
docs_and_scores = self.similarity_search_with_score(
query,
@ -257,30 +268,30 @@ class MongoDBAtlasVectorSearch(VectorStore):
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
pre_filter: Optional[dict] = None,
pre_filter: Optional[Dict] = None,
post_filter_pipeline: Optional[List[Dict]] = None,
**kwargs: Any,
) -> List[Document]:
"""Return docs selected using the maximal marginal relevance.
"""Return documents selected using the maximal marginal relevance.
Maximal marginal relevance optimizes for similarity to query AND diversity
among selected documents.
Args:
query: Text to look up documents similar to.
k: Optional Number of Documents to return. Defaults to 4.
fetch_k: Optional Number of Documents to fetch before passing to MMR
k: (Optional) number of documents to return. Defaults to 4.
fetch_k: (Optional) number of documents to fetch before passing to MMR
algorithm. Defaults to 20.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
pre_filter: Optional Dictionary of argument(s) to prefilter on document
pre_filter: (Optional) dictionary of argument(s) to prefilter on document
fields.
post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
following the knnBeta search.
post_filter_pipeline: (Optional) pipeline of MongoDB aggregation stages
following the knnBeta vector search.
Returns:
List of Documents selected by maximal marginal relevance.
List of documents selected by maximal marginal relevance.
"""
query_embedding = self._embedding.embed_query(query)
docs = self._similarity_search_with_score(
@ -303,11 +314,11 @@ class MongoDBAtlasVectorSearch(VectorStore):
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
metadatas: Optional[List[Dict]] = None,
collection: Optional[Collection[MongoDBDocumentType]] = None,
**kwargs: Any,
) -> MongoDBAtlasVectorSearch:
"""Construct MongoDBAtlasVectorSearch wrapper from raw documents.
"""Construct a `MongoDB Atlas Vector Search` vector store from raw documents.
This is a user-friendly interface that:
1. Embeds documents.

@ -3,7 +3,7 @@ from __future__ import annotations
import os
from time import sleep
from typing import TYPE_CHECKING, Any
from typing import Any
import pytest
@ -11,41 +11,46 @@ from langchain.docstore.document import Document
from langchain.schema.embeddings import Embeddings
from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch
if TYPE_CHECKING:
from pymongo import MongoClient
INDEX_NAME = "langchain-test-index"
NAMESPACE = "langchain_test_db.langchain_test_collection"
CONNECTION_STRING = os.environ.get("MONGODB_ATLAS_URI")
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
# Instantiate as constant instead of pytest fixture to prevent needing to make multiple
# connections.
def get_collection() -> Any:
from pymongo import MongoClient
@pytest.fixture
def collection() -> Any:
test_client = MongoClient(CONNECTION_STRING)
test_client: MongoClient = MongoClient(CONNECTION_STRING)
return test_client[DB_NAME][COLLECTION_NAME]
@pytest.fixture()
def collection() -> Any:
return get_collection()
class TestMongoDBAtlasVectorSearch:
@classmethod
def setup_class(cls, collection: Any) -> None:
def setup_class(cls) -> None:
# ensure the test collection is empty
collection = get_collection()
assert collection.count_documents({}) == 0 # type: ignore[index] # noqa: E501
@classmethod
def teardown_class(cls, collection: Any) -> None:
def teardown_class(cls) -> None:
collection = get_collection()
# delete all the documents in the collection
collection.delete_many({}) # type: ignore[index]
@pytest.fixture(autouse=True)
def setup(self, collection: Any) -> None:
def setup(self) -> None:
collection = get_collection()
# delete all the documents in the collection
collection.delete_many({}) # type: ignore[index]
def test_from_documents(self, embedding_openai: Embeddings) -> None:
def test_from_documents(
self, embedding_openai: Embeddings, collection: Any
) -> None:
"""Test end to end construction and search."""
documents = [
Document(page_content="Dogs are tough.", metadata={"a": 1}),
@ -64,7 +69,7 @@ class TestMongoDBAtlasVectorSearch:
assert output[0].page_content == "What is a sandwich?"
assert output[0].metadata["c"] == 1
def test_from_texts(self, embedding_openai: Embeddings) -> None:
def test_from_texts(self, embedding_openai: Embeddings, collection: Any) -> None:
texts = [
"Dogs are tough.",
"Cats have fluff.",
@ -81,7 +86,9 @@ class TestMongoDBAtlasVectorSearch:
output = vectorstore.similarity_search("Sandwich", k=1)
assert output[0].page_content == "What is a sandwich?"
def test_from_texts_with_metadatas(self, embedding_openai: Embeddings) -> None:
def test_from_texts_with_metadatas(
self, embedding_openai: Embeddings, collection: Any
) -> None:
texts = [
"Dogs are tough.",
"Cats have fluff.",
@ -102,7 +109,7 @@ class TestMongoDBAtlasVectorSearch:
assert output[0].metadata["c"] == 1
def test_from_texts_with_metadatas_and_pre_filter(
self, embedding_openai: Embeddings
self, embedding_openai: Embeddings, collection: Any
) -> None:
texts = [
"Dogs are tough.",
@ -124,7 +131,7 @@ class TestMongoDBAtlasVectorSearch:
)
assert output == []
def test_mmr(self, embedding_openai: Embeddings) -> None:
def test_mmr(self, embedding_openai: Embeddings, collection: Any) -> None:
texts = ["foo", "foo", "fou", "foy"]
vectorstore = MongoDBAtlasVectorSearch.from_texts(
texts,

Loading…
Cancel
Save